Replaced vmovdqa with vmovdqu because the memory is not aligned to 256 bits (32 bytes). Next step: align the memory and then switch back to the aligned instruction (vmovdqa).

This commit is contained in:
balhau@balhau.net 2021-12-25 19:14:49 +00:00
parent cbf7ecee92
commit 7345dd2665
No known key found for this signature in database
GPG key ID: 1E666F326A121830
5 changed files with 36 additions and 35 deletions

View file

@ -3,7 +3,6 @@
#include "../src/cpu/x86/sse.hpp" #include "../src/cpu/x86/sse.hpp"
#include "../src/cpu/naive.hpp" #include "../src/cpu/naive.hpp"
#include "../src/platform/timer.hpp" #include "../src/platform/timer.hpp"
#include <sys/time.h>
using namespace cpu; using namespace cpu;
using namespace platform; using namespace platform;

View file

@ -38,7 +38,7 @@
typedef unsigned char UChar; typedef unsigned char UChar;
typedef unsigned short UShort; typedef unsigned short UShort;
typedef unsigned int UInt; typedef unsigned int UInt;
typedef unsigned long ULong; typedef unsigned long int ULong;
// SSE DataTypes // SSE DataTypes
#define CHAR_LEN_128 16 #define CHAR_LEN_128 16

View file

@ -50,7 +50,7 @@ namespace cpu
packedULong[0] = (ULong)packedUInteger[1] << 32 | packedUInteger[0]; packedULong[0] = (ULong)packedUInteger[1] << 32 | packedUInteger[0];
packedULong[1] = (ULong)packedUInteger[3] << 32 | packedUInteger[2]; packedULong[1] = (ULong)packedUInteger[3] << 32 | packedUInteger[2];
packedULong[2] = (ULong)packedUInteger[5] << 32 | packedUInteger[4]; packedULong[2] = (ULong)packedUInteger[5] << 32 | packedUInteger[4];
packedULong[3] = (ULong)packedUInteger[7] << 32 | packedUInteger[7]; packedULong[3] = (ULong)packedUInteger[7] << 32 | packedUInteger[6];
} }
/** /**

View file

@ -1,9 +1,11 @@
#include "sse.hpp" #include "sse.hpp"
#ifdef ARCH_X86 #ifdef ARCH_X86
#include <iostream> #include <iostream>
void cpu::SSE::add_128(UChar *a,UChar *b){ void cpu::SSE::add_128(UChar *a,UChar *b){
__asm__ __volatile__ ( asm volatile (
"movdqa %0,%%xmm1\n" "movdqa %0,%%xmm1\n"
"paddb %1,%%xmm1\n" "paddb %1,%%xmm1\n"
"movdqa %%xmm1,%0" "movdqa %%xmm1,%0"
@ -14,9 +16,9 @@
//X86 Assembly to add two 128 bit numbers in the form of packed integers 32bit //X86 Assembly to add two 128 bit numbers in the form of packed integers 32bit
void cpu::SSE::add_128(UInt *a,UInt *b) { void cpu::SSE::add_128(UInt *a,UInt *b) {
__asm__ __volatile__ ( asm volatile (
"movdqa %0, %%xmm1\n" "movdqa %0,%%xmm1\n"
"paddw %1, %%xmm1\n" "paddw %1,%%xmm1\n"
"movdqa %%xmm1, %0" "movdqa %%xmm1, %0"
: "=m"(*a) : "=m"(*a)
: "m"(*b) : "m"(*b)
@ -26,9 +28,9 @@
//X86 Assembly to add two 128 bit numbers in the form of packed long 64bit //X86 Assembly to add two 128 bit numbers in the form of packed long 64bit
void cpu::SSE::add_128(ULong *a,ULong *b) { void cpu::SSE::add_128(ULong *a,ULong *b) {
__asm__ __volatile__ ( asm volatile (
"movdqa %0, %%xmm1\n" "movdqa %0,%%xmm1\n"
"paddd %1, %%xmm1\n" "paddd %1,%%xmm1\n"
"movdqa %%xmm1, %0" "movdqa %%xmm1, %0"
: "=m"(*a) : "=m"(*a)
: "m"(*b) : "m"(*b)
@ -37,35 +39,35 @@
//X86 Assembly to add two 256 bit numbers in the form of packed byte vector //X86 Assembly to add two 256 bit numbers in the form of packed byte vector
void cpu::SSE::add_256(UChar *a,UChar *b) { void cpu::SSE::add_256(UChar *a,UChar *b) {
__asm__ __volatile__ ( asm volatile (
"vmovdqa %0, %%ymm1\n" "vmovdqu %0,%%ymm1\n"
"vmovdqa %1, %%ymm2\n" "vmovdqu %1,%%ymm2\n"
"vpaddb %%ymm1, %%ymm1, %%ymm2\n" "vpaddb %%ymm3,%%ymm2,%%ymm1\n"
"vmovdqa %%ymm1, %0" "vmovdqu %%ymm1,%0"
: "=m"(*a) : "=m"(*a)
: "m"(*b) : "m"(*b)
); );
}; };
//X86 Assembly to add two 128 bit numbers in the form of packed long 32bit //X86 Assembly to add two 256 bit numbers in the form of packed int 32bit
void cpu::SSE::add_256(UInt *a,UInt *b) { void cpu::SSE::add_256(UInt *a,UInt *b) {
__asm__ __volatile__( asm volatile(
"vmovdqa %0, %%xmm1\n" "vmovdqu %0,%%ymm1\n"
"vmovdqa %1, %%ymm2\n" "vmovdqu %1,%%ymm2\n"
"vpaddw %%ymm1, %%ymm1, %%ymm2\n" "vpaddw %%ymm1, %%ymm2, %%ymm1\n"
"vmovdqa %%ymm1, %0" "vmovdqu %%ymm1,%0"
: "=m"(*a) : "=m"(*a)
: "m"(*b) : "m"(*b)
); );
}; };
//X86 Assembly to add two 128 bit numbers in the form of packed long 64bit //X86 Assembly to add two 256 bit numbers in the form of packed long 64bit
void cpu::SSE::add_256(ULong *a,ULong *b) { void cpu::SSE::add_256(ULong *a,ULong *b) {
__asm__ __volatile__( asm volatile(
"vmovdqa %0, %%ymm1\n" "vmovdqu %0, %%ymm1\n"
"vmovdqa %1, %%ymm2\n" "vmovdqu %1, %%ymm2\n"
"vpaddd %%ymm1, %%ymm1, %%ymm2\n" "vpaddd %%ymm1,%%ymm2,%%ymm1\n"
"vmovdqa %%ymm1, %0" "vmovdqu %%ymm1, %0"
: "=m"(*a) : "=m"(*a)
: "m"(*b) : "m"(*b)
); );

View file

@ -2,6 +2,7 @@
#include <iostream> #include <iostream>
#include <sys/time.h> #include <sys/time.h>
#include <unistd.h>
namespace platform namespace platform
{ {
@ -11,26 +12,25 @@ namespace platform
std::string label; std::string label;
long int start; long int start;
long int end; long int end;
long int gettime() long int gettime()
{ {
struct timeval tp; struct timeval tp;
gettimeofday(&tp, NULL); gettimeofday(&tp, NULL);
long int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000; return (double)(tp.tv_sec * 1000 + (double)tp.tv_usec / 1000);
return ms;
}; };
public: public:
Timer(const char* lbl) : label(lbl) Timer(const char *lbl)
{ {
this->label=std::string(lbl); this->label = std::string(lbl);
this->start = gettime(); this->start = this->gettime();
} }
~Timer() ~Timer()
{ {
this->end=this->gettime(); this->end = this->gettime();
unsigned long diff = this->end - this->start; unsigned long diff = this->end - this->start;
std::cout << this->label << diff << std::endl; std::cout << this->label << diff << std::endl;
} }
}; };