← Back to team overview

maria-developers team mailing list archive

Re: uint6korr optimization

 

Hi all.

Only question i have with this is if there's a possibility to make the
hf_mi_uint6korr(dest,src) macro a function.
So we can write as usual
    dest= hf_mi_uint6korr(src)

Didn't find how that can nicely be done with the assembler code.

Best regards.
HF

23.01.2014 16:20, Michael Widenius wrote:
Hi!

"Alexey" == Alexey Botchkov <holyfoot@xxxxxxxxxxxx> writes:
Alexey> Hi, Monty.
Alexey> I've looked at that part of the code, and tried to make the improved
Alexey> versions of uint6korr and mi_uint6korr.

Alexey> The small program attached that benchmarks these.
Alexey> Depending on enabled macros it loops uint6korr, hf_uint6korr,
Alexey> mi_uint6korr and hf_mi_uint6korr respectively. It performs 2 loops on
Alexey> each functions first runs it once, the second - twice, so we can
Alexey> calculate how much time was spent on the operation itself.

Alexey> The results i got so far are:
Alexey> elapsed 103 seconds on korr6-1
Alexey> elapsed 190 seconds on korr6-2
Alexey> elapsed 50 seconds on hf_korr6-1
Alexey> elapsed 79 seconds on hf_korr6-2
Alexey> elapsed 106 seconds on mi6-1
Alexey> elapsed 195 seconds on mi6-2
Alexey> elapsed 56 seconds on hf_mi6B-1
Alexey> elapsed 88 seconds on hf_mi6-2

Alexey> So the
Alexey>       hf_uint6korr is 3 times faster than uint6korr.
Alexey>       hf_mi_uint6korr is 2.8 times faster than mi_uint6korr.

Alexey> You're welcome to check the code out.

Thanks.

What is important is to get fast versions of

mi_uint3korr  (Used a lot in ma_dynrec.c and gis)
mi_uint4korr
mi_uint5korr
mi_uint6korr
mi_uint7korr
mi_uint8korr  (Used for some variables)

uint5korr
uint6korr     (Used a lot in Aria)
uint8korr

(I am including the full test so that anyone can comment upon this)

----------------------------------------------------------------------
#include <stdio.h>
#include <time.h>
#include <malloc.h>

#define TEST_KORR6
#define TEST_HF_KORR6
#define TEST_MI6
#define TEST_HF_MI6

#define uint6korr(A)    ((ulonglong)(((uint32)    ((uchar) (A)[0]))          + \
                                            (((uint32)    ((uchar) (A)[1])) << 8)   + \
                                            (((uint32)    ((uchar) (A)[2])) << 16)  + \
                                            (((uint32)    ((uchar) (A)[3])) << 24)) + \
                              (((ulonglong) ((uchar) (A)[4])) << 32) +       \
                              (((ulonglong) ((uchar) (A)[5])) << 40))
#define mi_uint6korr(A) ((ulonglong)(((uint32) (((const uchar*) (A))[5])) +\
                                           (((uint32) (((const uchar*) (A))[4])) << 8) +\
                                           (((uint32) (((const uchar*) (A))[3])) << 16) +\
                                           (((uint32) (((const uchar*) (A))[2])) << 24)) +\
                             (((ulonglong) (((uint32) (((const uchar*) (A))[1])) +\
                                                                                (((uint32) (((const uchar*) (A))[0]) << 8)))) <<\
                                                                   32))

#define hf_uint6korr(A) (((ulonglong) ((uint32 *) (A))[0]) + (((ulonglong) ((uint16 *) (A))[2]) << 32))
#define hf_mi_uint6korr(src, dest) \
     __asm__ (                   \
       "bswapq %1;"                      \
       "mov %1, %0;"                     \
       :"=r"(dest)                     \
       :"r"(hf_uint6korr(src)<<16)                    \
       :                 \
     )

typedef unsigned long long int ulonglong;
typedef unsigned int uint32;
typedef unsigned char uchar;
typedef unsigned short uint16;

time_t t0, t1;
ulonglong i;

#define GM 10000000000LL
#define BAS 2000000

int main()
{
   ulonglong *pb, *pb2;
   char *art, *art2;
   ulonglong ci;

   art= malloc(6*BAS);
   pb= malloc(sizeof(ulonglong)*BAS);
   for (i=0; i<6*BAS; i++)
     art[i]= (char)i;

   art2= malloc(6*BAS);
   pb2= malloc(sizeof(ulonglong)*BAS);
   for (i=0; i<6*BAS; i++)
     art2[i]= (char)i;

#ifdef TEST_KORR6
   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;

     pb[ci]= uint6korr(art+ci*6);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on korr6-1\n", t1 - t0);

   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;
     pb[ci]= uint6korr(art+ci*6);
     pb2[ci]= uint6korr(art2+ci*6);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on korr6-2\n", t1 - t0);
#endif /*KORR6*/

#ifdef TEST_HF_KORR6
   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;

     pb[ci]= hf_uint6korr(art+ci*6);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on hf_korr6-1\n", t1 - t0);

   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;
     pb[ci]= hf_uint6korr(art+ci*6);
     pb2[ci]= hf_uint6korr(art2+ci*6);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on hf_korr6-2\n", t1 - t0);
#endif /*HF_KORR6*/

#ifdef TEST_MI6
   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;

     pb[ci]= mi_uint6korr(art+ci*6);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on mi6-1\n", t1 - t0);

   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;
     pb[ci]= mi_uint6korr(art+ci*6);
     pb2[ci]= mi_uint6korr(art2+ci*6);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on mi6-2\n", t1 - t0);
#endif /*MI6*/

#ifdef TEST_HF_MI6
   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;
     hf_mi_uint6korr(art+ci*6, pb[ci]);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on hf_mi6B-1\n", t1 - t0);

   ci= 0;
   t0= time(0);
   for (i=0; i<GM; i++)
   {
     if (ci >= BAS)
       ci= 0;
     hf_mi_uint6korr(art+ci*6, pb[ci]);
     hf_mi_uint6korr(art2+ci*6, pb2[ci]);
     ci++;
   }
   t1= time(0);
   printf("elapsed %d seconds on hf_mi6-2\n", t1 - t0);
#endif /*HF_MI6*/

   return 0;
}


Looks ok.

Did you try with:

#define hf_uint6korr(A) (*((uint64 *) A) & 0xffffffffffffLL)

That may be faster, with the extra cost problem of reading 2 bytes
extra, so this only safe to use if one knows that there is a few extra
bytes in the buffer.

Can you do a patch to my_global.h and myisampack and add the new
functions there.

What to do:

- All improvements for now only for x64 architecture

- Replace uint6korr(A) and any other function that can be improved
   without reading extra bytes with a faster version.

- Improve with faster variants:
    mi_uint3korr
    mi_uint4korr
    mi_uint5korr
    mi_uint6korr
    mi_uint7korr
    mi_uint8korr
    uint5korr
    uint6korr
    uint8korr

If the above suggested version of uint6korr() that reads extra bytes
is notable faster, add a function:

uint6korr_unsafe()

Which of course should map to uint6korr for other architectures.

ok?

Regards,
Monty



Follow ups

References