← Back to team overview

maria-developers team mailing list archive

Re: uint6korr optimization

 

Hi!

>>>>> "Alexey" == Alexey Botchkov <holyfoot@xxxxxxxxxxxx> writes:

Alexey> Hi, Monty.
Alexey> I've looked at that part of the code, and tried to make the improved
Alexey> versions of uint6korr and mi_uint6korr.

Alexey> The small program attached that benchmarks these.
Alexey> Depending on enabled macros it loops uint6korr, hf_uint6korr,
Alexey> mi_uint6korr and hf_mi_uint6korr respectively. It performs 2 loops on
Alexey> each functions first runs it once, the second - twice, so we can
Alexey> calculate how much time was spent on the operation itself.

Alexey> The results i got so far are:
Alexey> elapsed 103 seconds on korr6-1
Alexey> elapsed 190 seconds on korr6-2
Alexey> elapsed 50 seconds on hf_korr6-1
Alexey> elapsed 79 seconds on hf_korr6-2
Alexey> elapsed 106 seconds on mi6-1
Alexey> elapsed 195 seconds on mi6-2
Alexey> elapsed 56 seconds on hf_mi6B-1
Alexey> elapsed 88 seconds on hf_mi6-2

Alexey> So the
Alexey>       hf_uint6korr is 3 times faster than uint6korr.
Alexey>       hf_mi_uint6korr is 2.8 times faster than mi_uint6korr.

Alexey> You're welcome to check the code out.

Thanks.

What is important is to get fast versions of

mi_uint3korr  (Used a lot in ma_dynrec.c and gis)
mi_uint4korr
mi_uint5korr
mi_uint6korr
mi_uint7korr
mi_uint8korr  (Used for some variables)

uint5korr
uint6korr     (Used a lot in Aria)
uint8korr

(I am including the full test so that anyone can comment upon this)

----------------------------------------------------------------------
#include <stdio.h>
#include <time.h>
#include <malloc.h>

#define TEST_KORR6
#define TEST_HF_KORR6
#define TEST_MI6
#define TEST_HF_MI6

#define uint6korr(A)    ((ulonglong)(((uint32)    ((uchar) (A)[0]))          + \
                                           (((uint32)    ((uchar) (A)[1])) << 8)   + \
                                           (((uint32)    ((uchar) (A)[2])) << 16)  + \
                                           (((uint32)    ((uchar) (A)[3])) << 24)) + \
                             (((ulonglong) ((uchar) (A)[4])) << 32) +       \
                             (((ulonglong) ((uchar) (A)[5])) << 40))
#define mi_uint6korr(A) ((ulonglong)(((uint32) (((const uchar*) (A))[5])) +\
                                          (((uint32) (((const uchar*) (A))[4])) << 8) +\
                                          (((uint32) (((const uchar*) (A))[3])) << 16) +\
                                          (((uint32) (((const uchar*) (A))[2])) << 24)) +\
                            (((ulonglong) (((uint32) (((const uchar*) (A))[1])) +\
                                                                               (((uint32) (((const uchar*) (A))[0]) << 8)))) <<\
                                                                  32))

#define hf_uint6korr(A) (((ulonglong) ((uint32 *) (A))[0]) + (((ulonglong) ((uint16 *) (A))[2]) << 32))
#define hf_mi_uint6korr(src, dest) \
    __asm__ (                   \
      "bswapq %1;"                      \
      "mov %1, %0;"                     \
      :"=r"(dest)                     \
      :"r"(hf_uint6korr(src)<<16)                    \
      :                 \
    )

typedef unsigned long long int ulonglong;
typedef unsigned int uint32;
typedef unsigned char uchar;
typedef unsigned short uint16;

time_t t0, t1;
ulonglong i;

#define GM 10000000000LL
#define BAS 2000000

int main()
{
  ulonglong *pb, *pb2;
  char *art, *art2;
  ulonglong ci;

  art= malloc(6*BAS);
  pb= malloc(sizeof(ulonglong)*BAS);
  for (i=0; i<6*BAS; i++)
    art[i]= (char)i;

  art2= malloc(6*BAS);
  pb2= malloc(sizeof(ulonglong)*BAS);
  for (i=0; i<6*BAS; i++)
    art2[i]= (char)i;

#ifdef TEST_KORR6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;

    pb[ci]= uint6korr(art+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on korr6-1\n", t1 - t0);

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= uint6korr(art+ci*6);
    pb2[ci]= uint6korr(art2+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on korr6-2\n", t1 - t0);
#endif /*KORR6*/

#ifdef TEST_HF_KORR6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;

    pb[ci]= hf_uint6korr(art+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on hf_korr6-1\n", t1 - t0);

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= hf_uint6korr(art+ci*6);
    pb2[ci]= hf_uint6korr(art2+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on hf_korr6-2\n", t1 - t0);
#endif /*HF_KORR6*/

#ifdef TEST_MI6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;

    pb[ci]= mi_uint6korr(art+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on mi6-1\n", t1 - t0);

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= mi_uint6korr(art+ci*6);
    pb2[ci]= mi_uint6korr(art2+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on mi6-2\n", t1 - t0);
#endif /*MI6*/

#ifdef TEST_HF_MI6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    hf_mi_uint6korr(art+ci*6, pb[ci]);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on hf_mi6B-1\n", t1 - t0);

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    hf_mi_uint6korr(art+ci*6, pb[ci]);
    hf_mi_uint6korr(art2+ci*6, pb2[ci]);
    ci++;
  }
  t1= time(0);
  printf("elapsed %d seconds on hf_mi6-2\n", t1 - t0);
#endif /*HF_MI6*/

  return 0;
}


Looks ok.

Did you try with:

#define hf_uint6korr(A) (*((uint64 *) A) & 0xffffffffffffLL) 

That may be faster, with the extra cost problem of reading 2 bytes
extra, so this only safe to use if one knows that there is a few extra
bytes in the buffer.

Can you do a patch to my_global.h and myisampack and add the new
functions there.

What to do:

- All improvements for now only for x64 architecture

- Replace uint6korr(A) and any other function that can be improved
  without reading extra bytes with a faster version.

- Improve with faster variants:
   mi_uint3korr
   mi_uint4korr
   mi_uint5korr
   mi_uint6korr
   mi_uint7korr
   mi_uint8korr
   uint5korr
   uint6korr
   uint8korr

If the above suggested version of uint6korr() that reads extra bytes
is notable faster, add a function:

uint6korr_unsafe()

Which of course should map to uint6korr for other architectures.

ok?

Regards,
Monty


Follow ups