Replaced vmovdqa with vmovdqu because the memory is not aligned to 256 bits (32 bytes). Next step: align the memory and then switch back to the aligned instruction (vmovdqa).
This commit is contained in:
parent
cbf7ecee92
commit
7345dd2665
5 changed files with 36 additions and 35 deletions
|
@ -3,7 +3,6 @@
|
||||||
#include "../src/cpu/x86/sse.hpp"
|
#include "../src/cpu/x86/sse.hpp"
|
||||||
#include "../src/cpu/naive.hpp"
|
#include "../src/cpu/naive.hpp"
|
||||||
#include "../src/platform/timer.hpp"
|
#include "../src/platform/timer.hpp"
|
||||||
#include <sys/time.h>
|
|
||||||
|
|
||||||
using namespace cpu;
|
using namespace cpu;
|
||||||
using namespace platform;
|
using namespace platform;
|
||||||
|
|
|
@ -38,7 +38,7 @@
|
||||||
typedef unsigned char UChar;
|
typedef unsigned char UChar;
|
||||||
typedef unsigned short UShort;
|
typedef unsigned short UShort;
|
||||||
typedef unsigned int UInt;
|
typedef unsigned int UInt;
|
||||||
typedef unsigned long ULong;
|
typedef unsigned long int ULong;
|
||||||
|
|
||||||
// SSE DataTypes
|
// SSE DataTypes
|
||||||
#define CHAR_LEN_128 16
|
#define CHAR_LEN_128 16
|
||||||
|
|
|
@ -50,7 +50,7 @@ namespace cpu
|
||||||
packedULong[0] = (ULong)packedUInteger[1] << 32 | packedUInteger[0];
|
packedULong[0] = (ULong)packedUInteger[1] << 32 | packedUInteger[0];
|
||||||
packedULong[1] = (ULong)packedUInteger[3] << 32 | packedUInteger[2];
|
packedULong[1] = (ULong)packedUInteger[3] << 32 | packedUInteger[2];
|
||||||
packedULong[2] = (ULong)packedUInteger[5] << 32 | packedUInteger[4];
|
packedULong[2] = (ULong)packedUInteger[5] << 32 | packedUInteger[4];
|
||||||
packedULong[3] = (ULong)packedUInteger[7] << 32 | packedUInteger[7];
|
packedULong[3] = (ULong)packedUInteger[7] << 32 | packedUInteger[6];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
#include "sse.hpp"
|
#include "sse.hpp"
|
||||||
|
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
#ifdef ARCH_X86
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
void cpu::SSE::add_128(UChar *a,UChar *b){
|
void cpu::SSE::add_128(UChar *a,UChar *b){
|
||||||
__asm__ __volatile__ (
|
asm volatile (
|
||||||
"movdqa %0,%%xmm1\n"
|
"movdqa %0,%%xmm1\n"
|
||||||
"paddb %1,%%xmm1\n"
|
"paddb %1,%%xmm1\n"
|
||||||
"movdqa %%xmm1,%0"
|
"movdqa %%xmm1,%0"
|
||||||
|
@ -14,9 +16,9 @@
|
||||||
|
|
||||||
//X86 Assembly to add two 128 bit numbers in the form of packed integers 32bit
|
//X86 Assembly to add two 128 bit numbers in the form of packed integers 32bit
|
||||||
void cpu::SSE::add_128(UInt *a,UInt *b) {
|
void cpu::SSE::add_128(UInt *a,UInt *b) {
|
||||||
__asm__ __volatile__ (
|
asm volatile (
|
||||||
"movdqa %0, %%xmm1\n"
|
"movdqa %0,%%xmm1\n"
|
||||||
"paddw %1, %%xmm1\n"
|
"paddw %1,%%xmm1\n"
|
||||||
"movdqa %%xmm1, %0"
|
"movdqa %%xmm1, %0"
|
||||||
: "=m"(*a)
|
: "=m"(*a)
|
||||||
: "m"(*b)
|
: "m"(*b)
|
||||||
|
@ -26,9 +28,9 @@
|
||||||
|
|
||||||
//X86 Assembly to add two 128 bit numbers in the form of packed long 64bit
|
//X86 Assembly to add two 128 bit numbers in the form of packed long 64bit
|
||||||
void cpu::SSE::add_128(ULong *a,ULong *b) {
|
void cpu::SSE::add_128(ULong *a,ULong *b) {
|
||||||
__asm__ __volatile__ (
|
asm volatile (
|
||||||
"movdqa %0, %%xmm1\n"
|
"movdqa %0,%%xmm1\n"
|
||||||
"paddd %1, %%xmm1\n"
|
"paddd %1,%%xmm1\n"
|
||||||
"movdqa %%xmm1, %0"
|
"movdqa %%xmm1, %0"
|
||||||
: "=m"(*a)
|
: "=m"(*a)
|
||||||
: "m"(*b)
|
: "m"(*b)
|
||||||
|
@ -37,35 +39,35 @@
|
||||||
|
|
||||||
//X86 Assembly to add two 256 bit numbers in the form of packed byte vector
|
//X86 Assembly to add two 256 bit numbers in the form of packed byte vector
|
||||||
void cpu::SSE::add_256(UChar *a,UChar *b) {
|
void cpu::SSE::add_256(UChar *a,UChar *b) {
|
||||||
__asm__ __volatile__ (
|
asm volatile (
|
||||||
"vmovdqa %0, %%ymm1\n"
|
"vmovdqu %0,%%ymm1\n"
|
||||||
"vmovdqa %1, %%ymm2\n"
|
"vmovdqu %1,%%ymm2\n"
|
||||||
"vpaddb %%ymm1, %%ymm1, %%ymm2\n"
|
"vpaddb %%ymm3,%%ymm2,%%ymm1\n"
|
||||||
"vmovdqa %%ymm1, %0"
|
"vmovdqu %%ymm1,%0"
|
||||||
: "=m"(*a)
|
: "=m"(*a)
|
||||||
: "m"(*b)
|
: "m"(*b)
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
//X86 Assembly to add two 128 bit numbers in the form of packed long 32bit
|
//X86 Assembly to add two 256 bit numbers in the form of packed int 32bit
|
||||||
void cpu::SSE::add_256(UInt *a,UInt *b) {
|
void cpu::SSE::add_256(UInt *a,UInt *b) {
|
||||||
__asm__ __volatile__(
|
asm volatile(
|
||||||
"vmovdqa %0, %%xmm1\n"
|
"vmovdqu %0,%%ymm1\n"
|
||||||
"vmovdqa %1, %%ymm2\n"
|
"vmovdqu %1,%%ymm2\n"
|
||||||
"vpaddw %%ymm1, %%ymm1, %%ymm2\n"
|
"vpaddw %%ymm1, %%ymm2, %%ymm1\n"
|
||||||
"vmovdqa %%ymm1, %0"
|
"vmovdqu %%ymm1,%0"
|
||||||
: "=m"(*a)
|
: "=m"(*a)
|
||||||
: "m"(*b)
|
: "m"(*b)
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
//X86 Assembly to add two 128 bit numbers in the form of packed long 64bit
|
//X86 Assembly to add two 256 bit numbers in the form of packed long 64bit
|
||||||
void cpu::SSE::add_256(ULong *a,ULong *b) {
|
void cpu::SSE::add_256(ULong *a,ULong *b) {
|
||||||
__asm__ __volatile__(
|
asm volatile(
|
||||||
"vmovdqa %0, %%ymm1\n"
|
"vmovdqu %0, %%ymm1\n"
|
||||||
"vmovdqa %1, %%ymm2\n"
|
"vmovdqu %1, %%ymm2\n"
|
||||||
"vpaddd %%ymm1, %%ymm1, %%ymm2\n"
|
"vpaddd %%ymm1,%%ymm2,%%ymm1\n"
|
||||||
"vmovdqa %%ymm1, %0"
|
"vmovdqu %%ymm1, %0"
|
||||||
: "=m"(*a)
|
: "=m"(*a)
|
||||||
: "m"(*b)
|
: "m"(*b)
|
||||||
);
|
);
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
namespace platform
|
namespace platform
|
||||||
{
|
{
|
||||||
|
@ -11,26 +12,25 @@ namespace platform
|
||||||
std::string label;
|
std::string label;
|
||||||
long int start;
|
long int start;
|
||||||
long int end;
|
long int end;
|
||||||
|
|
||||||
long int gettime()
|
long int gettime()
|
||||||
{
|
{
|
||||||
struct timeval tp;
|
struct timeval tp;
|
||||||
gettimeofday(&tp, NULL);
|
gettimeofday(&tp, NULL);
|
||||||
long int ms = tp.tv_sec * 1000 + tp.tv_usec / 1000;
|
return (double)(tp.tv_sec * 1000 + (double)tp.tv_usec / 1000);
|
||||||
return ms;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Timer(const char* lbl) : label(lbl)
|
Timer(const char *lbl)
|
||||||
{
|
{
|
||||||
this->label=std::string(lbl);
|
this->label = std::string(lbl);
|
||||||
this->start = gettime();
|
this->start = this->gettime();
|
||||||
}
|
}
|
||||||
|
|
||||||
~Timer()
|
~Timer()
|
||||||
{
|
{
|
||||||
this->end=this->gettime();
|
this->end = this->gettime();
|
||||||
unsigned long diff = this->end - this->start;
|
unsigned long diff = this->end - this->start;
|
||||||
std::cout << this->label << diff << std::endl;
|
std::cout << this->label << diff << std::endl;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in a new issue