C++ & CUDA LNK2019: unresolved external symbol and LNK1120: 2 unresolved externals

I have a program that uses Thrust vectors (thrust::device_vector) in both a .cpp file and a .cu file. When I compile the program, I receive three errors:

error LNK1120: 2 unresolved externals

error LNK2019: unresolved external symbol "class thrust::device_ptr<double> __cdecl thrust::cuda_cub::copy_n<struct thrust::cuda_cub::tag,double *,__int64,class thrust::device_ptr<double> >(struct thrust::cuda_cub::execution_policy<struct thrust::cuda_cub::tag> &,double *,__int64,class thrust::device_ptr<double>)" (??$copy_n@Utag@cuda_cub@thrust@@PEAN_JV?$device_ptr@N@3@@cuda_cub@thrust@@YA?AV?$device_ptr@N@1@AEAU?$execution_policy@Utag@cuda_cub@thrust@@@01@PEAN_JV21@@Z) referenced in function "class thrust::device_ptr<double> __cdecl thrust::cuda_cub::__copy::cross_system_copy_n<struct thrust::system::cpp::detail::tag,struct thrust::cuda_cub::tag,class std::_Vector_const_iterator<class std::_Vector_val<struct std::_Simple_types<double> > >,__int64,class thrust::device_ptr<double> >(struct thrust::system::cpp::detail::execution_policy<struct thrust::system::cpp::detail::tag> &,struct thrust::cuda_cub::execution_policy<struct thrust::cuda_cub::tag> &,class std::_Vector_const_iterator<class std::_Vector_val<struct std::_Simple_types<double> > >,__int64,class thrust::device_ptr<double>,struct thrust::detail::integral_constant<bool,0>)" (??$cross_system_copy_n@Utag@detail@cpp@system@thrust@@U1cuda_cub@5@V?$_Vector_const_iterator@V?$_Vector_val@U?$_Simple_types@N@std@@@std@@@std@@_JV?$device_ptr@N@5@@__copy@cuda_cub@thrust@@YA?AV?$device_ptr@N@2@AEAU?$execution_policy@Utag@detail@cpp@system@thrust@@@detail@cpp@system@2@AEAU?$execution_policy@Utag@cuda_cub@thrust@@@12@V?$_Vector_const_iterator@V?$_Vector_val@U?$_Simple_types@N@std@@@std@@@std@@_JV32@U?$integral_constant@_N$0A@@52@@Z)

error LNK2019: unresolved external symbol "void __cdecl decoder(class std::vector<double,class std::allocator<double> >,int,int,class std::vector<double,class std::allocator<double> >,int,int,class std::vector<double,class std::allocator<double> >,class std::vector<double,class std::allocator<double> >,class std::vector<double,class std::allocator<double> >,class std::vector<double,class std::allocator<double> >,int,int,class std::vector<double,class std::allocator<double> >,class std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> >,int,int)" (?decoder@@YAXV?$vector@NV?$allocator@N@std@@@std@@HH0HH0V12@11HH1V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@2@HH@Z) referenced in function "void __cdecl Autoencoder(class std::vector<double,class std::allocator<double> >)" (?Autoencoder@@YAXV?$vector@NV?$allocator@N@std@@@std@@@Z)

Autoencoder.cpp

#include <string>
#include<float.h>
#include <random>
#include "kernel.cuh"
#include "Autoencoder.h"
#include <windows.h>
#include <cuda.h>
#include <iostream>
#include <time.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#pragma comment(lib,"user32.lib")
using namespace std;
#define PI 3.141592654
#pragma once



#define COL 6
#define NERUN 200
#define CLasss 2
#define row_Data_train 242
#define row_Data_test 103
#define COL_NERUN 1200

void encoder(const vector<double>x_train, int row_Xtrain, int col_Xtrain, vector< double> a_f, int row_af, int col_af, const vector<double>b_f, vector< double> H_F, std::string G){
//
double *d_x_train, *d_a_f, *d_b_f, *d_x_a, *d_x_a_b , *d_H_F;
cudaMalloc((void **)&d_x_train, sizeof(double)* x_train.size());
cudaMalloc((void **)&d_a_f, sizeof(double)* a_f.size());
cudaMalloc((void **)&d_b_f, sizeof(double)*NERUN);
cudaMalloc((void **)&d_x_a, sizeof(double)* row_Xtrain * col_af);
cudaMalloc((void **)&d_x_a_b, sizeof(double)*row_Xtrain * col_af );
cudaMalloc((void **)&d_H_F, sizeof(double)*row_Xtrain * col_af);

// transfer data from host to device
cudaMemcpy(d_x_train, x_train.data(), sizeof(double)* x_train.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_a_f, a_f.data(), sizeof(double)* a_f.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_b_f, b_f.data(), sizeof(double)* NERUN, cudaMemcpyHostToDevice);
//////////////////////////////////////////------------call function-----------//////////////////////////////////////////
multi(d_x_train, d_a_f, d_x_a, row_Xtrain, col_Xtrain, row_af, col_af, row_Xtrain, col_af);
SUM(d_x_a, d_b_f, d_x_a_b, row_Xtrain * col_af, row_Xtrain, col_af);
if (G == "sigmoid"){
//sigmoid
Sigmoid(d_x_a_b, d_H_F, row_Xtrain * col_af);
}
if (G == "sinus"){
//sin
sinus(d_x_a_b, d_H_F, row_Xtrain * col_af);
}
/////////////////////////////////////////-----------------end---------------------///////////////////////////////////////
// device
cudaMemcpy(H_F.data(), d_H_F, sizeof(double)* row_Xtrain * col_af, cudaMemcpyDeviceToHost);
//
cudaFree(d_x_train);
cudaFree(d_a_f);
cudaFree(d_b_f);
cudaFree(d_x_a);
cudaFree(d_x_a_b);
cudaFree(d_H_F);
}

void decoder(const vector<double>x_train, int row_Xtrain, int col_Xtrain, const vector<double>a_f, int row_af, int col_af , const vector<double> b_f, vector<double> H_F, vector<double>HF_sudoinverse, vector<double>a_n, vector<double>b_n, std::string G, int N, int nerun){
int C;
double Max, Min;
//
double h_answer = 0, *d_mean;
cudaMalloc((void**)&d_mean, sizeof(double));
cudaMemcpy(d_mean, &h_answer, sizeof(double), cudaMemcpyHostToDevice);
printf("h_answer: %f \n", h_answer);

//host
double* inv = (double*)malloc(NERUN * NERUN * sizeof(double));

//device
double *d_x_train_norm ,*d_x_train_n_a ,*d_x_train_n_L , *d_H_F, *d_H_f_T,
*d_H_H_T1, *d_H_H_T2, *d_I, *d_H_H_T_I, *d_inv, *d_HF_sudoinverse , *d_a_n , *d_H_a_n;
//cudaMalloc((void **)&d_x_train, sizeof(double)* row_Data_train* COL);
cudaMalloc((void **)&d_x_train_norm, sizeof(double)* row_Xtrain* col_Xtrain);
cudaMalloc((void **)&d_x_train_n_a, sizeof(double)* row_Xtrain* col_Xtrain);
cudaMalloc((void **)&d_x_train_n_L, sizeof(double)* row_Xtrain* col_Xtrain);
cudaMalloc((void **)&d_H_F, sizeof(double)* H_F.size());
cudaMalloc((void **)&d_H_f_T, sizeof(double)* H_F.size());
cudaMalloc((void **)&d_H_H_T1, sizeof(double)* NERUN* NERUN);
cudaMalloc((void **)&d_H_H_T2, sizeof(double)* row_Data_train * row_Data_train );
cudaMalloc((void **)&d_I, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_H_H_T_I, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_inv, sizeof(double) * NERUN * NERUN);
cudaMalloc((void **)&d_HF_sudoinverse, sizeof(double)* NERUN* row_Data_train);
cudaMalloc((void **)&d_a_n, sizeof(double)* NERUN* col_Xtrain);
cudaMalloc((void **)&d_H_a_n, sizeof(double)* row_Data_train* col_Xtrain);

//Transfer Data from Host To Device
thrust::device_vector<double> ix_train(x_train);
double* d_x_train = thrust::raw_pointer_cast(&ix_train[0]);
//////////////////////////////////////////------------call function-----------//////////////////////////////////////////
encoder(x_train, row_Xtrain, col_Xtrain, a_f,row_af, col_af, b_f, H_F, "sigmoid");
// transfer data from host to device
cudaMemcpy(d_H_F, H_F.data(), sizeof(double)* H_F.size(), cudaMemcpyHostToDevice);
Transpose(d_H_f_T, d_H_F, row_Data_train, NERUN);
if (N > nerun){
multi(d_H_f_T, d_H_F, d_H_H_T1, NERUN, row_Data_train, row_Data_train, NERUN, NERUN, NERUN);
unit_matrix_cpu(d_I, NERUN, NERUN);
C = pow(10, 6);
divisional_cpu(C, d_I, NERUN, NERUN);
SUM2D_cpu(d_H_H_T1, d_I, d_H_H_T_I, NERUN, NERUN);
inv = inverse(d_H_H_T_I, NERUN);
cudaMemcpy(d_inv, inv, sizeof(double)* NERUN* NERUN, cudaMemcpyHostToDevice);
multi(d_inv, d_H_f_T, d_HF_sudoinverse, NERUN, NERUN, NERUN, row_Data_train, NERUN, row_Data_train);
cudaMemcpy(HF_sudoinverse.data(), d_HF_sudoinverse, sizeof(double)* NERUN* row_Data_train, cudaMemcpyDeviceToHost);
}

if (N < nerun){
multi(d_H_F, d_H_f_T, d_H_H_T2, row_Data_train, NERUN, NERUN, row_Data_train, row_Data_train, row_Data_train );
unit_matrix_cpu(d_I, NERUN, NERUN);
C = pow(10, 6);
divisional_cpu(C, d_I, NERUN, NERUN);
SUM2D_cpu(d_H_H_T2, d_I, d_H_H_T_I, NERUN, NERUN);
inv = inverse(d_H_H_T_I, NERUN);
cudaMemcpy(d_inv, inv, sizeof(double)* NERUN* NERUN, cudaMemcpyHostToDevice);
multi(d_inv, d_H_f_T, d_HF_sudoinverse, NERUN, NERUN, NERUN, row_Data_train, NERUN, row_Data_train);
}

if (G == "sinus"){
MAX_MIN_Matrix(d_x_train, row_Xtrain * col_Xtrain, &Max, &Min);
Norm_alize_cpu(d_x_train, d_x_train_norm, Min, Max, row_Xtrain * col_Xtrain);
arcsinus(d_x_train_norm, d_x_train_n_a, row_Xtrain *col_Xtrain);
multi(d_HF_sudoinverse, d_x_train_n_a, d_a_n, NERUN, row_Data_train, row_Xtrain, col_Xtrain, NERUN, col_Xtrain);
}

if (G == "sigmoid"){
MAX_MIN_Matrix(d_x_train, row_Xtrain * col_Xtrain, &Min, &Max);
Norm_alize_cpu(d_x_train, d_x_train_norm, Min, Max, row_Xtrain * col_Xtrain);
negative_log(d_x_train_norm, d_x_train_n_L, row_Xtrain * col_Xtrain);
multi(d_HF_sudoinverse, d_x_train_n_L, d_a_n, NERUN, row_Data_train, row_Xtrain, col_Xtrain, NERUN, col_Xtrain);
}
multi(d_H_F, d_a_n, d_H_a_n, row_Data_train, NERUN, NERUN, col_Xtrain, row_Data_train, col_Xtrain);
MSE(d_H_a_n, d_x_train_n_L, row_Data_train * col_Xtrain, d_mean);
cudaMemcpy(&h_answer, d_mean, sizeof(double), cudaMemcpyDeviceToHost);
printf("h_answer: %f \n", h_answer);
double b = sqrt(h_answer);
printf("b: %f \n", b);
/////////////////////////////////////////-----------------end---------------------///////////////////////////////////////
cudaFree(d_I);
}

void Autoencoder(vector<double>X_train){
vector<double>a_f(COL* NERUN);
vector<double>b_f(NERUN);
vector<double>H_F(row_Data_train * NERUN);
vector<double>HF_sudoinverse(NERUN * row_Data_train);
vector<double>b_n(NERUN);
vector<double>a_n;
a_f = rand_data(COL_NERUN);
b_f = rand_data(NERUN);
decoder(X_train, row_Data_train, COL, a_f, COL, NERUN, b_f, H_F, HF_sudoinverse, a_n, NERUN, COL , b_n, "sigmoid", 300, 200);
}
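
The first error involves copying from a std::_Vector_const_iterator<double> (a std::vector iterator on the host) to thrust::device_ptr<double>, which corresponds to the thrust::device_vector<double> ix_train(x_train) construction inside decoder above. That cross-system copy is normally only resolved when the translation unit is compiled by nvcc, so if Autoencoder.cpp is built by the host compiler (Item Type "C/C++" rather than "CUDA C/C++"), the thrust::cuda_cub::copy_n definition is missing at link time. A minimal sketch of one way to keep Thrust out of the .cpp side, using a hypothetical to_device() helper that is not part of the project as posted, would be:

// kernel.cu -- hypothetical helper, compiled by nvcc
extern "C" double* to_device(const double* host, size_t n)
{
    double* d = NULL;
    cudaMalloc((void**)&d, n * sizeof(double));                      // allocate the device buffer
    cudaMemcpy(d, host, n * sizeof(double), cudaMemcpyHostToDevice); // plain runtime copy, no Thrust involved
    return d;                                                        // caller releases it with cudaFree()
}

// kernel.cuh
// extern "C" double* to_device(const double* host, size_t n);

// Autoencoder.cpp -- instead of constructing thrust::device_vector<double> ix_train(x_train):
// double* d_x_train = to_device(x_train.data(), x_train.size());
// ... use d_x_train ...
// cudaFree(d_x_train);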

Autoencoder.h

#ifndef AUTOENCODER_H_
#define AUTOENCODER_H_
void Autoencoder(std::vector<double> X_train);
void encoder(const std::vector<double>x_train , int row_Xtrain, int col_Xtrain, std::vector< double> a_f, int row_af, int col_af, const std::vector<double>b_f,std::vector< double> H_F, std::string G);
void decoder(const std::vector<double>x_train, int row_Xtrain, int col_Xtrain, const std::vector<double>a_f, int row_af, int col_af, const std::vector<double> b_f, std::vector<double> H_F, std::vector<double>HF_sudoinverse, std::vector<double>a_n, int row_an, int col_an, std::vector<double>b_n, std::string G, int N, int nerun);
#endif

kernel.cu

#include <thrust\device_vector.h>
#include <vector>
#include <cublas_v2.h>
#include <curand.h>
#include "kernel.cuh"
#pragma once

using namespace std ;

#define row_liver 345
#define col_liver 7
#define data_train 242
#define data_test 103
#define row_liver_col_liver 2415
#define row_x_train 242 * 7
#define BLOCK_DIM 16

// This function computes the normalized values; first we need the maximum value of each column.

__global__ void Normalize_kernel(double *input1 ,double *input2 ,double *output , int row , int col){
int id =blockIdx.x * blockDim.x + threadIdx.x;
if( id < row_liver_col_liver )
output[ id ] = input1[id] / input2[id] ;

}

// This function finds the maximum value in each column.
// It uses the cuBLAS library.

double* MAX_VALUE_ECOL(double *X){

thrust::host_vector<double> h_data;
thrust::device_vector<double>d_data;
// vector<double>value_result(col_liver);
double* value_result =(double*)malloc( col_liver * sizeof(double));
double* result_h_data =(double*)malloc( row_liver* col_liver * sizeof(double));

for(int i=0; i< row_liver_col_liver ; i++){
h_data.push_back(X[i]);
d_data.push_back(h_data[i]);
}

cublasHandle_t handle;
cublasCreate(&handle);

int result;
for (int i=0; i< col_liver ; i++) {
cublasIdamax(handle, row_liver , (double*)thrust::raw_pointer_cast(d_data.data()) + i, col_liver , &result);
// printf("%i %f\n",result,h_data[i+(result-1)* col_liver]);
value_result[i] = h_data[i+(result-1)* col_liver] ;
// printf(" %f\n",value_result[i]);
}
return value_result;
}

extern "C"
void Normalize_cpu(double *input1 ,double *input2 ,double *output , int row , int col) {
int n = row_liver * col_liver;
int blockSize = 512;
int gridSize = (int) ceil((float) n / blockSize);
Normalize_kernel<<< gridSize , blockSize >>>(input1 ,input2 ,output , row , col);
}
/*int k=0;
for(int i=0 ; i<h_data.size() ;i++){
if( k == col_liver ){
k=0;}
result_h_data[i] = h_data[i] / value_result[k];
k++;
}
for(int i=0 ; i<h_data.size() ;i++){
printf("%f \n", result_h_data); }

return result_h_data;

}*/
// To parallelize the Normalize function we expand this vector to the size of the data set: vector X only holds the 7 per-column maxima, so this function builds a vector the same size as the input.
extern "C"
double* vector_big(double* X){
double *MV = (double*)malloc(row_liver* col_liver * sizeof(double));
int j = 0;
for (int i = 0; i < row_liver* col_liver; i++){
if (j >= col_liver){
j = 0;
}
MV[i] = X[j];
j++;
}
return MV;
}
extern "C"
void data(double* X , double* Y_train ,double* Y_test ,double* X_train, double* X_test) {
double mazrab[row_liver];
int j = 0 , l=0 , m=0 , n=0 , w=0;
mazrab[w] = col_liver -1;
for (int r=0 ; r < (row_liver * col_liver) -1 ; ){
r = mazrab[w] + col_liver;
w++;
mazrab[w] = r ;
}
int k=0;
for(int i=0 ; i<row_liver * col_liver ; i++ ){
if (i<row_x_train && i!= mazrab[k]){
X_train[j] = X[i];
j++;
}
if (i<row_x_train && (i == mazrab[k]) ){
Y_train[l] = X[i];
l++;
k++;
}
if(i >=row_x_train && i != mazrab[k]){
X_test[m] = X[i];
m++;
}
if(i >=row_x_train && (i == mazrab[k])){
Y_test[n] = X[i];
n++;
k++;
}
}
}
extern "C"
void CLASStovector (double *array ,double *result , int size_array){
int j=0 ;
for(int i=0 ; i<size_array ; i++){
if (array[i] == 0.5){
result [j] = 1;
result [j+1] = 0;
j+=2; }
if (array[i] == 1){
result[j] = 0;
result[j+1] = 1;
j+=2; }
}
}
extern "C"
double* index_Max(double* input ,int size_matrix , int classs){
double* arg_max = (double*)malloc(data_train* sizeof(double));
int j=0 ,k=0 , count =-1 , number=0 ;
double Max =-1;
for(int i=0 ; i< size_matrix +1 ; i++ ){
count++;
if(count == classs){
j=0;
Max=-1;
count=0;
arg_max[k] = number;
k++;

}
if( j < classs && input[i] > Max) {
Max = input[i] ;
number = i ;
j++ ;
}

}
return arg_max;
}
extern "C++"
std::vector<double> rand_data(int size_matrix){

size_t n = size_matrix;
size_t i;
curandGenerator_t gen;
double *devData ;
double mean ,stddev ;
// double *input = (double*)malloc(n* sizeof(double));
vector<double>input(size_matrix);

/* Allocate n floats on device */
cudaMalloc((void **)&devData, n*sizeof(double));

/* Create pseudo-random number generator */
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);

/* Set seed */
curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);

/* Generate n floats on device */
curandGenerateNormalDouble(gen, devData, n, 0.0,1.0);

/* Copy device memory to host */
cudaMemcpy(input.data(), devData, n * sizeof(double),
cudaMemcpyDeviceToHost);

/* Show result */
/* for(i = 0; i < n; i++) {
printf("i= %d , %1.20f ",i, input);
}
printf("\n");*/

/* Cleanup */
curandDestroyGenerator(gen);
cudaFree(devData);
return input;
}
// Compute C = A * B
__global__ void matrixMultiplyShared(double * A, double * B, double * C,
int numARows, int numAColumns,
int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
__shared__ double sA[32][32 + 1]; // Tile size of 32x32
__shared__ double sB[32][32 + 1];

int Row = blockDim.y*blockIdx.y + threadIdx.y;
int Col = blockDim.x*blockIdx.x + threadIdx.x;
double Cvalue = 0.0;
sA[threadIdx.y][threadIdx.x] = 0.0;
sB[threadIdx.y][threadIdx.x] = 0.0;

for (int k = 0; k < (((numAColumns - 1)/ 32) + 1); k++)
{
if ( (Row < numARows) && (threadIdx.x + (k*32)) < numAColumns)
{
sA[threadIdx.y][threadIdx.x] = A[(Row*numAColumns) + threadIdx.x + (k*32)];
}
else
{
sA[threadIdx.y][threadIdx.x] = 0.0;
}
if ( Col < numBColumns && (threadIdx.y + k*32) < numBRows)
{
sB[threadIdx.y][threadIdx.x] = B[(threadIdx.y + k*32)*numBColumns + Col];
}
else
{
sB[threadIdx.y][threadIdx.x] = 0.0;
}
__syncthreads();

for (int j = 0; j < 32; ++j)
{
Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x];
}
}
if (Row < numCRows && Col < numCColumns)
{
C[Row*numCColumns + Col] = Cvalue;
}
}
extern "C"
void multi(double * A, double * B, double * C, int numARows,
int numAColumns, int numBRows, int numBColumns,
int numCRows, int numCColumns)
{
// Initialize the grid and block dimensions
dim3 dimBlock(32, 32, 1);
dim3 dimGrid((numCColumns / 32) + 1, (numCRows / 32) + 1, 1);

//@@ Launch the GPU Kernel here
matrixMultiplyShared << <dimGrid, dimBlock >> >(A, B, C, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns);

}
// This kernel adds a vector to a 2D matrix: each row of the matrix is summed with the vector.

__global__ void SUM_kernel( double* matrix, const double* vector , double *output ,const unsigned int size )
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// sum the current element with the corresponding element of the vector
output[idx] = matrix[idx] + vector[threadIdx.x];
}
}

extern "C"
void SUM ( double* matrix, const double* vector , double* output ,const unsigned int size , int row_out , int col_out)
{
int gridSize = row_out ;
int blockSize = col_out;

SUM_kernel<<< gridSize , blockSize >>>( matrix , vector , output , size );
}

__global__ void Sigmoid_kernel(double* input, double* output ,const unsigned int size)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// apply the sigmoid to each element
output[idx] = ( 1 / (1 + exp((-1) * input [idx])));
}
}

extern "C"
void Sigmoid(double* input, double* output , const unsigned int size)
{
int blockSize = 1024;
int gridSize = (int) ceil((float) size / blockSize);

Sigmoid_kernel<<< gridSize , blockSize >>>( input , output , size );
}

__global__ void sinus_kernel(double *input , double *output , int size)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
{
// take the sine of each element
output[idx] = sin( input[idx] );
}
}

extern "C"
void sinus(double *input , double *output , int size)
{
int blockSize = 1024;
int gridSize = (int) ceil((float) size / blockSize);

sinus_kernel<<< gridSize , blockSize >>>( input , output , size );

}

__global__ void transpose_kernel(double *odata, double *idata, int width, int height)
{
__shared__ double block[BLOCK_DIM][BLOCK_DIM+1];

// read the matrix tile into shared memory
// load one element per thread from device memory (idata) and store it
// in transposed order in block[][]
unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
if((xIndex < width) && (yIndex < height))
{
unsigned int index_in = yIndex * width + xIndex;
block[threadIdx.y][threadIdx.x] = idata[index_in];
}

// synchronise to ensure all writes to block[][] have completed
__syncthreads();

// write the transposed matrix tile to global memory (odata) in linear order
xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
if((xIndex < height) && (yIndex < width))
{
unsigned int index_out = yIndex * height + xIndex;
odata[index_out] = block[threadIdx.x][threadIdx.y];
}
}


extern "C"
void Transpose(double *odata, double *idata, int width, int height)
{

// setup execution parameters
int gridSize_x = (int) ceil((float) width / BLOCK_DIM);
int gridSize_y = (int) ceil((float) height / BLOCK_DIM);
dim3 grid(gridSize_x, gridSize_y , 1);
dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

transpose_kernel<<< grid, threads >>>(odata, idata, width , height );

}

__global__ void unit_matrix_kernel(double *I, int numR, int numC) {

int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(y < numR && x < numC) {
if(x == y)
I[numR * y + x ] =1;

if( x != y)
I[numR * y + x ] =0;
}
}
extern "C"
void unit_matrix_cpu(double *I ,int ROW ,int COL ) {

int blockSize = 32;
int gridSize_x = (int) ceil((float) COL / blockSize);
int gridSize_y = (int) ceil((float) ROW / blockSize);
dim3 dimGrid(gridSize_x , gridSize_y );
// ROW and COL here are the numbers of rows and columns of the matrix.
dim3 dimBlock( blockSize , blockSize );
unit_matrix_kernel<<<dimGrid,dimBlock>>>(I , ROW , COL);
}

__global__ void divisional_Kernel(int adad , double *I, int numR, int numC) {

int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(y < numR && x < numC) {
if(x == y){
I[numR * y + x ] = I[numR * y + x ] / adad ;
}
}
}
extern "C"
void divisional_cpu(int adad , double *I, int ROW ,int COL) {

int blockSize = 32;
int gridSize_x = (int) ceil((float) COL / blockSize);
int gridSize_y = (int) ceil((float) ROW / blockSize);
dim3 dimGrid(gridSize_x , gridSize_y );
// ROW and COL here are the numbers of rows and columns of the matrix.
dim3 dimBlock( blockSize , blockSize );
divisional_Kernel<<<dimGrid,dimBlock>>>(1000000,I , ROW , COL);
}

// grid 2D block 2D
__global__ void SUM2D_kernel(double *A, double *B, double *C, int nx , int ny)
{
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int idx = iy * nx + ix;
if (iy < nx && ix < ny)
C[idx] = A[idx] + B[idx];
}
extern "C"
void SUM2D_cpu(double *A, double *B, double *C, const int nx, const int ny)
{
int dimx = 32;
int dimy = 32;
dim3 block(dimx, dimy);
dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

SUM2D_kernel<<<grid, block>>>(A, B, C, nx, ny);
}

extern "C"
double* inverse(double* L, int n)
{
cublasHandle_t cu_cublasHandle;
cublasCreate(&cu_cublasHandle);
double** adL;
double** adC;
double* dL;
double* dC;
int* dLUPivots;
int* dLUInfo;

size_t szA = n * n * sizeof(double);

cudaMalloc(&adL, sizeof(double*));
cudaMalloc(&adC, sizeof(double*));
cudaMalloc(&dL, szA);
cudaMalloc(&dC, szA);
cudaMalloc(&dLUPivots, n * sizeof(int));
cudaMalloc(&dLUInfo, sizeof(int));
cudaMemcpy(dL, L, szA, cudaMemcpyHostToDevice);
cudaMemcpy(adL, &dL, sizeof(double*), cudaMemcpyHostToDevice);
cudaMemcpy(adC, &dC, sizeof(double*), cudaMemcpyHostToDevice);

cublasDgetrfBatched(cu_cublasHandle, n, adL, n, dLUPivots, dLUInfo, 1);
cudaDeviceSynchronize();

cublasDgetriBatched(cu_cublasHandle, n, (const double **)adL, n, dLUPivots, adC, n, dLUInfo, 1);
cudaDeviceSynchronize();


double* res = (double*)malloc(szA);

cudaMemcpy(res,dC , szA, cudaMemcpyDeviceToHost);

cudaFree(adL);
cudaFree(adC);
cudaFree(dL);
cudaFree(dC);
cudaFree(dLUPivots);
cudaFree(dLUInfo);
cublasDestroy(cu_cublasHandle);

return res;
}

// This function finds the maximum and minimum values over the whole matrix.
extern "C"
void MAX_MIN_Matrix(double *input ,int size_matrix ,double *min , double *max ){

thrust::device_vector<double> d_A(size_matrix);
thrust::copy(input, input + size_matrix, d_A.begin());
thrust::minimum<double> op;
thrust::maximum<double> op1;
*min = thrust::reduce(d_A.begin(), d_A.end(), 1000000, op);
*max = thrust::reduce(d_A.begin(), d_A.end(), -1000000, op1);
}

__global__ void Norm_alize_kernel(double *input, double *output, double min , double max ,int size)
{
double a = 0.1 , b = 0.9 ;
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size)
output[idx] = (( input[idx] - min ) /( max - min )) * (b - a) + a;

}
extern "C"
void Norm_alize_cpu(double *input, double *output, double min , double max ,int size)
{
int blockSize = 512;
int gridSize = (int) ceil((float) size / blockSize);

Norm_alize_kernel<<<gridSize, blockSize>>>(input, output, min,max , size);

}
__global__ void negative_log_kernel(double *input ,double *output ,int size_matrix)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size_matrix)
{
output[idx] = (-1) * log (( 1/input[idx]) - 1);
}
}


extern "C"
void negative_log (double *input ,double *output ,int size_matrix)
{

int blockSize = 512;
int gridSize = (int) ceil((float) size_matrix / blockSize);

negative_log_kernel<<<gridSize, blockSize>>>( input , output , size_matrix);

}



__global__ void arcsinus_kernel(double *input ,double *output ,int size_matrix)
{
// get the current element index for the thread
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size_matrix)
{
output[idx] = asin(input[idx]);
}
}


extern "C"
void arcsinus (double *input ,double *output ,int size_matrix)
{

int blockSize = 512;
int gridSize = (int) ceil((float) size_matrix / blockSize);

arcsinus_kernel<<<gridSize, blockSize>>>( input , output , size_matrix);
}


__global__ void MSE_kernel(double* input1, double* input2, int size_matrix, double *mse){
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size_matrix)
atomicAdd(mse,(((input1[idx] - input2[idx])*(input1[idx] - input2[idx]))/size_matrix) );

}

extern "C"
void MSE(double *input1, double *input2, int size_matrix, double *mse)
{

int blockSize = 512;
int gridSize = (int)ceil((float)size_matrix / blockSize);

MSE_kernel << <gridSize, blockSize >> >(input1, input2, size_matrix, mse);

}

kernel.cuh

#ifndef KERNEL_CUH_
#define KERNEL_CUH_


extern "C"
double* MAX_VALUE_ECOL(double *X);
extern "C"
double* vector_big(double* X);
extern "C"
void Normalize_cpu(double *input1, double *input2, double *output, int row, int col);
extern "C"
void data(double* X, double* Y_train, double* Y_test, double* X_train, double* X_test);
extern "C"
void CLASStovector(double *array, double *result, int size_array);
extern "C"
double* index_Max(double* input, int size_matrix, int classs);
extern "C++"
std::vector<double> rand_data( int size_matrix);
extern "C"
void multi(double * A, double * B, double * C, int numARows,int numAColumns, int numBRows, int numBColumns,int numCRows, int numCColumns);
extern "C"
void SUM(double* matrix, const double* vector, double* output, const unsigned int size, int row_out, int col_out);
extern "C"
void Sigmoid(double* input, double* output, const unsigned int size);
extern "C"
void sinus(double *input, double *output, int size);
extern "C"
void Transpose(double *odata, double *idata, int width, int height);
extern "C"
void unit_matrix_cpu(double *I, int ROW, int COL);
extern "C"
void divisional_cpu(int adad, double *I, int ROW, int COL);
extern "C"
void SUM2D_cpu(double *A, double *B, double *C, const int nx, const int ny);
extern "C"
double* inverse(double* L, int n);
extern "C"
void MAX_MIN_Matrix(double *input, int size_matrix , double *min, double *max);
extern "C"
void Norm_alize_cpu(double *input, double *output, double min, double max, int size);
extern "C"
void negative_log(double *input, double *output, int size_matrix);
extern "C"
void arcsinus(double *input, double *output, int size_matrix);
extern "C"
void MSE(double *input1, double *input2, int size_matrix, double *mse);


#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600

#else
static __inline__ __device__ double atomicAdd(double *address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
if (val == 0.0)
return __longlong_as_double(old);
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
#endif

#endif

main.cpp

#include"iostream"
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<vector>
#include"liver.h"
#include<float.h>
#include<string>
#include <random>
#include "kernel.cuh"
#include "Autoencoder.h"
#include <windows.h>
#include <cuda.h>
#include <iostream>
#include <time.h>
#include <chrono>
#include <algorithm>
#include <cuda_runtime.h>
#pragma comment(lib,"user32.lib")
using namespace std;
#define PI 3.141592654
#pragma once

#define row_liver 345
#define col_liver 7
#define data_train 242
#define data_test 103
#define col 6
#define cLass 2
#define nerun 200
#define data_train_cLass 484
#define data_test_cLass 206
void main(){
double *X = (double*)malloc(row_liver* col_liver * sizeof(double));
double *X_out = (double*)malloc(row_liver* col_liver * sizeof(double));
// MV_EC holds the maximum value of each column.
double *MV_EC = (double*)malloc(col_liver * sizeof(double));
double *MV = (double*)malloc(row_liver* col_liver * sizeof(double));
//
double *x_train = (double*)malloc(data_train* col * sizeof(double));
//test
vector<double>X_train(data_train* col);
double *label_train = (double*)malloc(data_train* sizeof(double));
double *Y_train = (double*)malloc(data_train* cLass * sizeof(double));
double *Max_Index_Y_train = (double*)malloc(data_train* sizeof(double));
double *Max_Index_Y_test = (double*)malloc(data_test* sizeof(double));
double *x_test = (double*)malloc(data_test* col * sizeof(double));
//test
vector<double>X_test(data_test* col);
double *label_test = (double*)malloc(data_test* sizeof(double));
double *Y_test = (double*)malloc(data_test* cLass * sizeof(double));
//
double *d_X, *d_MV, *d_Xout;
cudaMalloc((void **)&d_X, sizeof(double)* row_liver* col_liver);
cudaMalloc((void **)&d_MV, sizeof(double)*row_liver* col_liver);
cudaMalloc((void **)&d_Xout, sizeof(double)* row_liver* col_liver);
// load the data set
load_data_Set_liver(X);
MV_EC = MAX_VALUE_ECOL(X);
MV = vector_big(MV_EC);

// transfer data from host to device
cudaMemcpy(d_X, X, sizeof(double)* row_liver* col_liver, cudaMemcpyHostToDevice);
cudaMemcpy(d_MV, MV, sizeof(double)* row_liver* col_liver, cudaMemcpyHostToDevice);
//call
Normalize_cpu(d_X, d_MV, d_Xout, row_liver, col_liver);
// device
cudaMemcpy(X_out, d_Xout, sizeof(double)* row_liver* col_liver, cudaMemcpyDeviceToHost);
/*for (int i = 0; i < row_liver* col_liver; i++)
printf("%f\n", X_out);*/
data(X_out, label_train, label_test, x_train, x_test);
//test
cudaMemcpy(X_train.data(), x_train, sizeof(double)* data_train* col, cudaMemcpyHostToHost);
cudaMemcpy(X_test.data(), x_test, sizeof(double)* data_test* col, cudaMemcpyHostToHost);
//
CLASStovector(label_train, Y_train, data_train_cLass);
CLASStovector(label_test, Y_test, data_test_cLass);
Max_Index_Y_train = index_Max(Y_train, data_train_cLass, cLass);
Max_Index_Y_test = index_Max(Y_test, data_test_cLass, cLass);

Autoencoder(X_train);


cudaFree(d_X);
cudaFree(d_MV);
cudaFree(d_Xout);
//
free(X_out);
getchar();
}

I have also added the libraries directly under
myproject -> Properties -> Linker -> Input -> Additional Dependencies (a small illustration follows the list):
cudart_static.lib
kernel32.lib
user32.lib
gdi32.lib
winspool.lib
cublas.lib
comdlg32.lib
advapi32.lib
shell32.lib
ole32.lib
oleaut32.lib
uuid.lib
odbc32.lib
odbccp32.lib
curand.lib

but I don't know what the problem is when building the program. Could you help me, please?
