L
ligazetom
Guest
Hello all,
I am using multiple threads to solve task in image processing faster. The thing is that no matter what I do, it gets worse with higher thread count. So just to make sure, I made the most minimalistic version I could, to replicate the problem. Every thread creates its own 10x10 matrix and recursively finds its determinant.
First thing to notice is that on linux it takes 500ms for one thread and on windows 4800ms. I mean, whatever, but it is more than 10 times difference.
But the bad thing is that if I add more threads, it gets way worse.
Windows-MSVC 64-bit:
1 Thread = 4479ms
2 Threads = 7500ms
3 Threads = 11300ms
4 Threads = 15800 ms
Windows-mingw 64-bit:
1 Thread = 2389ms
2 Threads = 3000ms
3 Threads = 3500ms
4 Threads = from 3700ms to 4500ms
Linux-gcc 64-bit:
1 Thread = 490ms
2 Threads = 478ms
3 Threads = First: 503ms; Other two: 1230ms
4 Threads = 1340ms
Obviously Linux is so way ahead I cried and decided to move my project to Linux, because it is really CPU intensive and these times are totally different level. However I wonder, why MSVC produces code that sucks that much. No thread waits for other one to finish. The only thing they do is write to console and it is definitely not the bottleneck. If any of you have any idea what could be the culprit please tell me. I used visual studio community 2017 for msvc build and codeblocks for mingw.
Tested on Win7/Linux intel 2C+2T and Win10 ryzen 8C+8T.
I posted similar question on stackoverflow, but since I found out mingw gives way better results so windows is not the only one to blame.
Thanks
Test code:
#include <iostream>
#include <cmath>
#include <thread>
#include <chrono>
#include <float.h>
class FVector
{
public:
FVector();
FVector(int length);
FVector(const FVector &vec);
FVector(FVector &&vec);
FVector &operator=(const FVector &vec);
FVector &operator=(FVector &&vec);
~FVector();
void setLength(int length);
int getLength() const;
double *getData();
const double* getConstData() const;
private:
double *data;
int length;
void allocateDataArray(int length);
void deallocateDataArray();
};
FVector::FVector() {
data = nullptr;
length = 0;
}
FVector::FVector(int length) {
data = nullptr;
this->length = length;
allocateDataArray(length);
for (int i = 0; i < length; i++) {
data = 0.;
}
}
FVector::FVector(const FVector &vec) {
allocateDataArray(vec.length);
length = vec.length;
for (int i = 0; i < length; i++) {
data = vec.data;
}
}
FVector::FVector(FVector &&vec) {
data = vec.data;
vec.data = nullptr;
length = vec.length;
}
FVector &FVector:perator=(const FVector &vec) {
deallocateDataArray();
if (data == nullptr) {
allocateDataArray(vec.length);
for (int i = 0; i < vec.length; i++) {
data = vec.data;
}
length = vec.length;
}
return *this;
}
FVector &FVector:perator=(FVector &&vec) {
deallocateDataArray();
if (data == nullptr) {
data = vec.data;
vec.data = nullptr;
length = vec.length;
}
return *this;
}
FVector::~FVector() {
deallocateDataArray();
}
void FVector::allocateDataArray(int length) {
data = new double[length];
}
void FVector::deallocateDataArray() {
if (data != nullptr) {
delete[] data;
}
data = nullptr;
}
int FVector::getLength() const {
return length;
}
double *FVector::getData() {
return data;
}
void FVector::setLength(int length) {
deallocateDataArray();
allocateDataArray(length);
this->length = length;
}
const double* FVector::getConstData() const {
return data;
}
class FMatrix
{
public:
FMatrix();
FMatrix(int columns, int rows);
FMatrix(const FMatrix &mat);
FMatrix(FMatrix &&mat);
FMatrix& operator=(const FMatrix &mat);
FMatrix& operator=(FMatrix &&mat);
~FMatrix();
FVector *getData();
const FVector* getConstData() const;
void makeIdentity();
int determinant() const;
private:
FVector *data;
int columns;
int rows;
void deallocateDataArray();
void allocateDataArray(int count);
};
FMatrix::FMatrix() {
data = nullptr;
columns = 0;
rows = 0;
}
FMatrix::FMatrix(int columns, int rows) {
data = nullptr;
allocateDataArray(columns);
for (int i = 0; i < columns; i++) {
data.setLength(rows);
}
this->columns = columns;
this->rows = rows;
}
FMatrix::FMatrix(const FMatrix &mat) {
data = nullptr;
allocateDataArray(mat.columns);
for (int i = 0; i < mat.columns; i++) {
data.setLength(mat.data.getLength());
data = mat.data;
}
columns = mat.columns;
rows = mat.rows;
}
FMatrix::FMatrix(FMatrix &&mat) {
data = mat.data;
mat.data = nullptr;
columns = mat.columns;
rows = mat.rows;
}
FMatrix &FMatrix:perator=(const FMatrix &mat) {
deallocateDataArray();
if (data == nullptr) {
allocateDataArray(mat.columns);
for (int i = 0; i < mat.columns; i++) {
data.setLength(mat.rows);
data = mat.data;
}
}
columns = mat.columns;
rows = mat.rows;
return *this;
}
FMatrix &FMatrix:perator=(FMatrix &&mat) {
deallocateDataArray();
data = mat.data;
mat.data = nullptr;
columns = mat.columns;
rows = mat.rows;
return *this;
}
FMatrix::~FMatrix() {
deallocateDataArray();
}
void FMatrix::deallocateDataArray() {
if (data != nullptr) {
delete[] data;
}
data = nullptr;
}
void FMatrix::allocateDataArray(int count) {
data = new FVector[count];
}
FVector *FMatrix::getData() {
return data;
}
void FMatrix::makeIdentity() {
for (int i = 0; i < columns; i++) {
for (int j = 0; j < rows; j++) {
if (i == j) {
data.getData()[j] = 1.;
}
else {
data.getData()[j] = 0.;
}
}
}
}
int FMatrix::determinant() const {
int det = 0;
FMatrix subMatrix(columns - 1, rows - 1);
int subi;
if (columns == rows && rows == 1) {
return data[0].getData()[0];
}
if (columns != rows) {
//throw EXCEPTIONS::SINGULAR_MATRIX;
}
if (columns == 2)
return ((data[0].getConstData()[0] * data[1].getConstData()[1]) - (data[1].getConstData()[0] * data[0].getConstData()[1]));
else {
for (int x = 0; x < columns; x++) {
subi = 0;
for (int i = 0; i < columns; i++) {
for (int j = 1; j < columns; j++) {
if (x == i) {
continue;
}
subMatrix.data[subi].getData()[j - 1] = data.getConstData()[j];
}
if (x != i) {
subi++;
}
}
det += (pow(-1, x) * data[x].getConstData()[0] * subMatrix.determinant());
}
}
return det;
}
const FVector* FMatrix::getConstData() const {
return data;
}
class FCore
{
public:
FCore();
~FCore();
void process();
private:
int getMaxThreads() const;
void joinThreads(std::thread *threads, int max);
};
void parallelTest(int i) {
auto start = std::chrono::high_resolution_clock::now();
FMatrix m(10, 10);
m.makeIdentity();
std::cout << "Det: " << i << "= " << m.determinant() << std::endl;
auto finish = std::chrono::high_resolution_clock::now();
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(finish - start);
std::cout << "Time: " << microseconds.count() / 1000. << std::endl;
}
FCore::FCore()
{
}
FCore::~FCore()
{
}
void FCore:rocess() {
/*********************************************/
/*Set this to limit number of created threads*/
int threadCount = getMaxThreads();
/*********************************************/
/*********************************************/
std::cout << "Thread count: " << threadCount;
std::thread *threads = new std::thread[threadCount];
for (int i = 0; i < threadCount; i++) {
threads = std::thread(parallelTest, i);
}
joinThreads(threads, threadCount);
delete[] threads;
getchar();
}
int FCore::getMaxThreads() const {
int count = std::thread::hardware_concurrency();
if (count == 0) {
return 1;
}
else {
return count;
}
}
void FCore::joinThreads(std::thread *threads, int max) {
for (int i = 0; i < max; i++) {
threads.join();
}
}
int main() {
FCore core;
core.process();
return 0;
}
Only the last few methods are important, other code is just setting up classes and constructors.
Continue reading...
I am using multiple threads to solve task in image processing faster. The thing is that no matter what I do, it gets worse with higher thread count. So just to make sure, I made the most minimalistic version I could, to replicate the problem. Every thread creates its own 10x10 matrix and recursively finds its determinant.
First thing to notice is that on linux it takes 500ms for one thread and on windows 4800ms. I mean, whatever, but it is more than 10 times difference.
But the bad thing is that if I add more threads, it gets way worse.
Windows-MSVC 64-bit:
1 Thread = 4479ms
2 Threads = 7500ms
3 Threads = 11300ms
4 Threads = 15800 ms
Windows-mingw 64-bit:
1 Thread = 2389ms
2 Threads = 3000ms
3 Threads = 3500ms
4 Threads = from 3700ms to 4500ms
Linux-gcc 64-bit:
1 Thread = 490ms
2 Threads = 478ms
3 Threads = First: 503ms; Other two: 1230ms
4 Threads = 1340ms
Obviously Linux is so way ahead I cried and decided to move my project to Linux, because it is really CPU intensive and these times are totally different level. However I wonder, why MSVC produces code that sucks that much. No thread waits for other one to finish. The only thing they do is write to console and it is definitely not the bottleneck. If any of you have any idea what could be the culprit please tell me. I used visual studio community 2017 for msvc build and codeblocks for mingw.
Tested on Win7/Linux intel 2C+2T and Win10 ryzen 8C+8T.
I posted similar question on stackoverflow, but since I found out mingw gives way better results so windows is not the only one to blame.
Thanks
Test code:
#include <iostream>
#include <cmath>
#include <thread>
#include <chrono>
#include <float.h>
class FVector
{
public:
FVector();
FVector(int length);
FVector(const FVector &vec);
FVector(FVector &&vec);
FVector &operator=(const FVector &vec);
FVector &operator=(FVector &&vec);
~FVector();
void setLength(int length);
int getLength() const;
double *getData();
const double* getConstData() const;
private:
double *data;
int length;
void allocateDataArray(int length);
void deallocateDataArray();
};
FVector::FVector() {
data = nullptr;
length = 0;
}
FVector::FVector(int length) {
data = nullptr;
this->length = length;
allocateDataArray(length);
for (int i = 0; i < length; i++) {
data = 0.;
}
}
FVector::FVector(const FVector &vec) {
allocateDataArray(vec.length);
length = vec.length;
for (int i = 0; i < length; i++) {
data = vec.data;
}
}
FVector::FVector(FVector &&vec) {
data = vec.data;
vec.data = nullptr;
length = vec.length;
}
FVector &FVector:perator=(const FVector &vec) {
deallocateDataArray();
if (data == nullptr) {
allocateDataArray(vec.length);
for (int i = 0; i < vec.length; i++) {
data = vec.data;
}
length = vec.length;
}
return *this;
}
FVector &FVector:perator=(FVector &&vec) {
deallocateDataArray();
if (data == nullptr) {
data = vec.data;
vec.data = nullptr;
length = vec.length;
}
return *this;
}
FVector::~FVector() {
deallocateDataArray();
}
void FVector::allocateDataArray(int length) {
data = new double[length];
}
void FVector::deallocateDataArray() {
if (data != nullptr) {
delete[] data;
}
data = nullptr;
}
int FVector::getLength() const {
return length;
}
double *FVector::getData() {
return data;
}
void FVector::setLength(int length) {
deallocateDataArray();
allocateDataArray(length);
this->length = length;
}
const double* FVector::getConstData() const {
return data;
}
class FMatrix
{
public:
FMatrix();
FMatrix(int columns, int rows);
FMatrix(const FMatrix &mat);
FMatrix(FMatrix &&mat);
FMatrix& operator=(const FMatrix &mat);
FMatrix& operator=(FMatrix &&mat);
~FMatrix();
FVector *getData();
const FVector* getConstData() const;
void makeIdentity();
int determinant() const;
private:
FVector *data;
int columns;
int rows;
void deallocateDataArray();
void allocateDataArray(int count);
};
FMatrix::FMatrix() {
data = nullptr;
columns = 0;
rows = 0;
}
FMatrix::FMatrix(int columns, int rows) {
data = nullptr;
allocateDataArray(columns);
for (int i = 0; i < columns; i++) {
data.setLength(rows);
}
this->columns = columns;
this->rows = rows;
}
FMatrix::FMatrix(const FMatrix &mat) {
data = nullptr;
allocateDataArray(mat.columns);
for (int i = 0; i < mat.columns; i++) {
data.setLength(mat.data.getLength());
data = mat.data;
}
columns = mat.columns;
rows = mat.rows;
}
FMatrix::FMatrix(FMatrix &&mat) {
data = mat.data;
mat.data = nullptr;
columns = mat.columns;
rows = mat.rows;
}
FMatrix &FMatrix:perator=(const FMatrix &mat) {
deallocateDataArray();
if (data == nullptr) {
allocateDataArray(mat.columns);
for (int i = 0; i < mat.columns; i++) {
data.setLength(mat.rows);
data = mat.data;
}
}
columns = mat.columns;
rows = mat.rows;
return *this;
}
FMatrix &FMatrix:perator=(FMatrix &&mat) {
deallocateDataArray();
data = mat.data;
mat.data = nullptr;
columns = mat.columns;
rows = mat.rows;
return *this;
}
FMatrix::~FMatrix() {
deallocateDataArray();
}
void FMatrix::deallocateDataArray() {
if (data != nullptr) {
delete[] data;
}
data = nullptr;
}
void FMatrix::allocateDataArray(int count) {
data = new FVector[count];
}
FVector *FMatrix::getData() {
return data;
}
void FMatrix::makeIdentity() {
for (int i = 0; i < columns; i++) {
for (int j = 0; j < rows; j++) {
if (i == j) {
data.getData()[j] = 1.;
}
else {
data.getData()[j] = 0.;
}
}
}
}
int FMatrix::determinant() const {
int det = 0;
FMatrix subMatrix(columns - 1, rows - 1);
int subi;
if (columns == rows && rows == 1) {
return data[0].getData()[0];
}
if (columns != rows) {
//throw EXCEPTIONS::SINGULAR_MATRIX;
}
if (columns == 2)
return ((data[0].getConstData()[0] * data[1].getConstData()[1]) - (data[1].getConstData()[0] * data[0].getConstData()[1]));
else {
for (int x = 0; x < columns; x++) {
subi = 0;
for (int i = 0; i < columns; i++) {
for (int j = 1; j < columns; j++) {
if (x == i) {
continue;
}
subMatrix.data[subi].getData()[j - 1] = data.getConstData()[j];
}
if (x != i) {
subi++;
}
}
det += (pow(-1, x) * data[x].getConstData()[0] * subMatrix.determinant());
}
}
return det;
}
const FVector* FMatrix::getConstData() const {
return data;
}
class FCore
{
public:
FCore();
~FCore();
void process();
private:
int getMaxThreads() const;
void joinThreads(std::thread *threads, int max);
};
void parallelTest(int i) {
auto start = std::chrono::high_resolution_clock::now();
FMatrix m(10, 10);
m.makeIdentity();
std::cout << "Det: " << i << "= " << m.determinant() << std::endl;
auto finish = std::chrono::high_resolution_clock::now();
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(finish - start);
std::cout << "Time: " << microseconds.count() / 1000. << std::endl;
}
FCore::FCore()
{
}
FCore::~FCore()
{
}
void FCore:rocess() {
/*********************************************/
/*Set this to limit number of created threads*/
int threadCount = getMaxThreads();
/*********************************************/
/*********************************************/
std::cout << "Thread count: " << threadCount;
std::thread *threads = new std::thread[threadCount];
for (int i = 0; i < threadCount; i++) {
threads = std::thread(parallelTest, i);
}
joinThreads(threads, threadCount);
delete[] threads;
getchar();
}
int FCore::getMaxThreads() const {
int count = std::thread::hardware_concurrency();
if (count == 0) {
return 1;
}
else {
return count;
}
}
void FCore::joinThreads(std::thread *threads, int max) {
for (int i = 0; i < max; i++) {
threads.join();
}
}
int main() {
FCore core;
core.process();
return 0;
}
Only the last few methods are important, other code is just setting up classes and constructors.
Continue reading...