// Please choose a data type to compile #define DATATYPE 0 #if DATATYPE==0 #pragma message "Compiling using StorageT=half ComputeT=float" #define StorageT half #define ComputeT float #define sizeofStorageT 2 #define sizeofComputeT 4 #define CUDNNStorageT CUDNN_DATA_HALF #define CUDNNConvComputeT CUDNN_DATA_FLOAT #define CPUStorage2ComputeT(x) (cpu_half2float(x)) #define CPUCompute2StorageT(x) (cpu_float2half(x)) #define GPUStorage2ComputeT(x) (__half2float(x)) #define GPUCompute2StorageT(x) (__float2half(x)) #define GPUgemm Hgemm #define GPUasum Hasum #define ISNAN(x) (ishnan(x)) #define ComputeT_MIN FLT_MIN #elif DATATYPE==1 #pragma message "Compiling using StorageT=float ComputeT=float" #define StorageT float #define ComputeT float #define sizeofStorageT 4 #define sizeofComputeT 4 #define CUDNNStorageT CUDNN_DATA_FLOAT #define CUDNNConvComputeT CUDNN_DATA_FLOAT #define CPUStorage2ComputeT(x) (x) #define CPUCompute2StorageT(x) (x) #define GPUStorage2ComputeT(x) (x) #define GPUCompute2StorageT(x) (x) #define GPUgemm cublasSgemm #define GPUasum cublasSasum #define ISNAN(x) (std::isnan(x)) #define ComputeT_MIN FLT_MIN #elif DATATYPE==2 #pragma message "Compiling using StorageT=double ComputeT=double" #define StorageT double #define ComputeT double #define sizeofStorageT 8 #define sizeofComputeT 8 #define CUDNNStorageT CUDNN_DATA_DOUBLE #define CUDNNConvComputeT CUDNN_DATA_DOUBLE #define CPUStorage2ComputeT(x) (x) #define CPUCompute2StorageT(x) (x) #define GPUStorage2ComputeT(x) (x) #define GPUCompute2StorageT(x) (x) #define GPUgemm cublasDgemm #define GPUasum cublasDasum #define ISNAN(x) (std::isnan(x)) #define ComputeT_MIN DBL_MIN #endif ////////////////////////////////////////////////////////////////////////////////////////////////// // Includes ////////////////////////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include ////////////////////////////////////////////////////////////////////////////////////////////////// // Debugging utility ////////////////////////////////////////////////////////////////////////////////////////////////// void FatalError(const int lineNumber=0) { std::cerr << "FatalError"; if (lineNumber!=0) std::cerr<<" at LINE "< 0x7f800000) { ret.x = 0x7fffU; return ret; } sign = ((x >> 16) & 0x8000); // Get rid of +Inf/-Inf, +0/-0. if (u > 0x477fefff) { ret.x = sign | 0x7c00U; return ret; } if (u < 0x33000001) { ret.x = (sign | 0x0000); return ret; } exponent = ((u >> 23) & 0xff); mantissa = (u & 0x7fffff); if (exponent > 0x70) { shift = 13; exponent -= 0x70; } else { shift = 0x7e - exponent; exponent = 0; mantissa |= 0x800000; } lsb = (1 << shift); lsb_s1 = (lsb >> 1); lsb_m1 = (lsb - 1); // Round to nearest even. remainder = (mantissa & lsb_m1); mantissa >>= shift; if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { ++mantissa; if (!(mantissa & 0x3ff)) { ++exponent; mantissa = 0; } } ret.x = (sign | (exponent << 10) | mantissa); return ret; } float cpu_half2float(half h) { unsigned sign = ((h.x >> 15) & 1); unsigned exponent = ((h.x >> 10) & 0x1f); unsigned mantissa = ((h.x & 0x3ff) << 13); if (exponent == 0x1f) { /* NaN or Inf */ mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0); exponent = 0xff; } else if (!exponent) { /* Denorm or Zero */ if (mantissa) { unsigned int msb; exponent = 0x71; do { msb = (mantissa & 0x400000); mantissa <<= 1; /* normalize */ --exponent; } while (!msb); mantissa &= 0x7fffff; /* 1.mantissa is implicit */ } } else { exponent += 0x70; } int temp = ((sign << 31) | (exponent << 23) | mantissa); return *((float*)((void*)&temp)); } bool operator <(const half& x, const half& y) { return cpu_half2float(x) < cpu_half2float(y); } std::ostream& operator<< (std::ostream& stream, const half& x) { stream << cpu_half2float(x); return stream; } ////////////////////////////////////////////////////////////////////////////////////////////////// // File format ////////////////////////////////////////////////////////////////////////////////////////////////// void memorySizePrint(size_t bytes){ if (bytes<512){ std::cout<& v){ std::cout<<"["<0) std::cout<1){ for (int i=1;i& dim){ size_t res = 1; for (int i=0;i& dim){ size_t res = 1; for (int i=1;i& dim){ size_t res = 1; for (int i=2;i class Tensor{ public: std::vector dim; T* CPUmem; std::string name; // compile will check if your time is not correct for writeGPU and readGPU void writeGPU(T* GPUmem){ cudaMemcpy(GPUmem, CPUmem, numel()*sizeof(T), cudaMemcpyHostToDevice); }; void readGPU(T* GPUmem){ cudaMemcpy(CPUmem, GPUmem, numel()*sizeof(T), cudaMemcpyDeviceToHost); }; Tensor(): CPUmem(NULL){}; size_t numel(){ return ::numel(dim); }; size_t numBytes(){ return sizeof(T)*numel(); }; int numofitems(){ return dim[0]; }; size_t sizeofitem(){ return ::sizeofitem(dim); }; ~Tensor(){ if (CPUmem!=NULL) delete[] CPUmem; }; void initialize(T val){ for (size_t i=0;i0){ read_cnt = fread((void*)(name.data()), sizeof(char), lenName, fp); if (read_cnt!=lenName) { std::cerr<<"Error at Tensor::readHeader: wrong data type. "<0){ read_cnt = fread((void*)(&dim[0]), sizeof(int), nbDims, fp); if (read_cnt!=nbDims) { std::cerr<<"Error at Tensor::readHeader: wrong data type. "<* floatTensor = new Tensor(fp); this->dim = floatTensor->dim ; this->name = floatTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((half*)(CPUmem))+i,&v,sizeof(half)); } delete floatTensor; }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(half))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* halfTensor = new Tensor(fp); this->dim = halfTensor->dim ; this->name = halfTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((float*)(CPUmem))+i,&v,sizeof(float)); } delete halfTensor; }else if (myTypeid==typeID(typeid(double)) && fpTypeid==typeID(typeid(float))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* floatTensor = new Tensor(fp); this->dim = floatTensor->dim ; this->name = floatTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((double*)(CPUmem))+i,&v,sizeof(double)); } delete floatTensor; }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(double))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* doubleTensor = new Tensor(fp); this->dim = doubleTensor->dim ; this->name = doubleTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i]); memcpy(((float*)(CPUmem))+i,&v,sizeof(float)); } delete doubleTensor; }else if (myTypeid==typeID(typeid(half)) && fpTypeid==typeID(typeid(double))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* doubleTensor = new Tensor(fp); this->dim = doubleTensor->dim ; this->name = doubleTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i])); memcpy(((half*)(CPUmem))+i,&v,sizeof(half)); } delete doubleTensor; }else if (myTypeid==typeID(typeid(float)) && fpTypeid==typeID(typeid(half))){ fseek(fp, -(sizeof(uint8_t)+sizeof(uint32_t)), SEEK_CUR); Tensor* halfTensor = new Tensor(fp); this->dim = halfTensor->dim ; this->name = halfTensor->name; Malloc(batch_size); for(size_t i=0; iCPUmem[i])); memcpy(((double*)(CPUmem))+i,&v,sizeof(double)); } delete halfTensor; }else{ std::cerr<<"Tensor conversion is not supported: from Type "<0){ read_cnt = fread((void*)(name.data()), sizeof(char), lenName, fp); if (read_cnt!=lenName) return NULL; } int nbDims; read_cnt = fread((void*)(&nbDims), sizeof(int), 1, fp); if (read_cnt!=1) return NULL; dim.resize(nbDims); if (nbDims>0){ read_cnt = fread((void*)(&dim[0]), sizeof(int), nbDims, fp); if (read_cnt!=nbDims) return NULL; } size_t n = numel(); Malloc(batch_size); read_cnt = fread((void*)(CPUmem), sizeof(T), n, fp); if (read_cnt!=n){ delete [] CPUmem; CPUmem = NULL; return NULL; } } return CPUmem; }; void Malloc(int batch_size){ size_t n = numel(); //std::cout<<" "; memorySizePrint(n*sizeof(T)); std::cout< dim2write){ uint8_t myTypeid = typeID(typeid(T)); fwrite((void*)(&myTypeid), sizeof(uint8_t), 1, fp); uint32_t typesizeof = uint32_t(sizeof(T)); fwrite((void*)(&typesizeof), sizeof(uint32_t), 1, fp); int lenName = name.size(); fwrite((void*)(&lenName), sizeof(int), 1, fp); if (lenName>0) fwrite((void*)(name.data()), sizeof(char), lenName, fp); int nbDims = dim2write.size(); fwrite((void*)(&nbDims), sizeof(int), 1, fp); if (nbDims>0) fwrite((void*)(&dim2write[0]), sizeof(int), nbDims, fp); if (ferror (fp)){ std::cerr << "disk writing failed"<0){ fwrite((void*)(CPUmem), sizeof(T), n, fp); if (ferror (fp)){ std::cerr << "disk writing failed" << std::endl; FatalError(); } } }; //support continuous write across many NdTensors //write with header void write(FILE* fp){ writeHeader(fp,dim); writeData(fp); }; void write(std::string filename){ FILE* fp = fopen(filename.c_str(),"wb"); while (fp==NULL) { std::cerr<<"Tensor::write: fail to open file "< dim_): dim(dim_){ CPUmem = new T [numel()]; }; Tensor(std::vector dim_, T initValue): dim(dim_){ int n = numel(); CPUmem = new T [n]; if (initValue == T(0)) memset(CPUmem, 0, n*sizeof(T)); else for (int i=0;i dim_): name(name_),dim(dim_){ CPUmem = new T [numel()]; }; void permute(std::vector v){ size_t nbItems = numofitems(); size_t sizeofitem_ = sizeofitem(); size_t nbBytes = sizeofitem_ * sizeof(T); T* CPUmemNew = new T[numel()]; memcpy(CPUmemNew, CPUmem, nbItems * nbBytes); for (size_t i=0;i display_dim){ std::cout<<" name:"<= N) return; for (size_t idx = idxBase; idx < min(N,idxBase+CUDA_NUM_LOOPS); ++idx ){ GPUdst[idx] = value; } } void GPU_set_value(size_t N, StorageT* GPUdst, StorageT value){ Kernel_set_value<<>>(CUDA_GET_LOOPS(N),N,GPUdst,value); checkCUDA(__LINE__,cudaGetLastError()); } void GPU_set_ones(size_t N, StorageT* GPUdst){ GPU_set_value(N, GPUdst, CPUCompute2StorageT(1)); } void GPU_set_negones(size_t N, StorageT* GPUdst){ GPU_set_value(N, GPUdst, CPUCompute2StorageT(-1)); } void GPU_set_zeros(size_t N, StorageT* GPUdst){ GPU_set_value(N, GPUdst, CPUCompute2StorageT(0)); } __global__ void Kernel_integrate( unsigned int xSize, unsigned int ySize, unsigned int zSize, ComputeT xMin, ComputeT yMin, ComputeT zMin, ComputeT unit, ComputeT margin, unsigned int width, unsigned int height, const ComputeT* depth, const ComputeT* pose, const ComputeT* intrinsics, StorageT *tsdf, uint8_t *weight) { unsigned int x = blockIdx.x; unsigned int y = threadIdx.x; ComputeT xWorld = xMin + x * unit; ComputeT yWorld = yMin + y * unit; ComputeT zWorld = zMin; ComputeT xCamera = pose[0] * xWorld + pose[1] * yWorld + pose[2] *zWorld + pose[3]; ComputeT yCamera = pose[4] * xWorld + pose[5] * yWorld + pose[6] *zWorld + pose[7]; ComputeT zCamera = pose[8] * xWorld + pose[9] * yWorld + pose[10] *zWorld + pose[11]; ComputeT xDelta = pose[2] * unit; ComputeT yDelta = pose[6] * unit; ComputeT zDelta = pose[10] * unit; unsigned int idx_offset = x * ySize * zSize + y * zSize; for (unsigned int z = 0; z < zSize; ++z, xCamera += xDelta, yCamera += yDelta, zCamera += zDelta){ ComputeT xOzCamera = xCamera / zCamera; ComputeT yOzCamera = yCamera / zCamera; int px = roundf(intrinsics[0] * xOzCamera + intrinsics[2]); int py = roundf(intrinsics[4] * yOzCamera + intrinsics[5]); if (px < 0 || px >= width || py < 0 || py >= height) continue; ComputeT p_depth = *(depth + py * width + px); if (p_depth == 0.0) continue; ComputeT diff = (p_depth - zCamera) * sqrtf(1.0 + xOzCamera * xOzCamera + yOzCamera * yOzCamera); if(diff > -margin){ ComputeT v_new = fminf(1.0, diff/margin); //tsdf //v_new = 1.0 - fabs(v_new); // 1-tdf // comment this out if you want to use tsdf unsigned int idx = idx_offset + z; uint8_t w = weight[idx]; ComputeT v = GPUStorage2ComputeT(tsdf[idx]); tsdf[idx] = GPUCompute2StorageT(fmin(fmax((ComputeT(w)*v + v_new)/(ComputeT(w + 1)), -1.f), 1.f)); weight[idx] = min(w+1,254); } } } ////////////////////////////////////////////////////////////////////////////////////////////////// // main ////////////////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ if (argc < 5 || argc >13){ std::cout<<"Usage:"<* depthMaps_CPU = new Tensor(argv[1]); unsigned int numFrames = depthMaps_CPU->dim[0]; unsigned int width = depthMaps_CPU->dim[3]; unsigned int height = depthMaps_CPU->dim[2]; std::cout<<"depth maps ["<numBytes()) ); memoryBytes+=depthMaps_CPU->numBytes(); checkCUDA(__LINE__, cudaMemcpy(depthMaps_GPU, depthMaps_CPU->CPUmem, depthMaps_CPU->numBytes(), cudaMemcpyHostToDevice) ); delete depthMaps_CPU; Tensor* intrinsics_CPU = new Tensor(argv[2]); ComputeT* intrinsics_GPU; checkCUDA(__LINE__, cudaMalloc(&intrinsics_GPU, intrinsics_CPU->numBytes()) ); memoryBytes+=intrinsics_CPU->numBytes(); checkCUDA(__LINE__, cudaMemcpy(intrinsics_GPU, intrinsics_CPU->CPUmem, intrinsics_CPU->numBytes(), cudaMemcpyHostToDevice) ); delete intrinsics_CPU; Tensor* cameraRtW2C_CPU = new Tensor(argv[3]); ComputeT* cameraRtW2C_GPU; checkCUDA(__LINE__, cudaMalloc(&cameraRtW2C_GPU, cameraRtW2C_CPU->numBytes()) ); memoryBytes+=cameraRtW2C_CPU->numBytes(); checkCUDA(__LINE__, cudaMemcpy(cameraRtW2C_GPU, cameraRtW2C_CPU->CPUmem, cameraRtW2C_CPU->numBytes(), cudaMemcpyHostToDevice) ); delete cameraRtW2C_CPU; ComputeT xMin; xMin = (argc<=5? -0.05 : atof(argv[5] )); std::cout<<"xMin ="<>>(xSize, ySize, zSize, xMin, yMin, zMin, unit, margin, width, height, depthMaps_GPU+width*height*f, cameraRtW2C_GPU+3*4*f, intrinsics_GPU, tsdf_GPU, weight_GPU); } std::vector dim; dim.push_back(xSize); dim.push_back(ySize); dim.push_back(zSize); Tensor* tsdf_CPU = new Tensor(dim); tsdf_CPU->readGPU(tsdf_GPU); FILE* fp = fopen(argv[4],"wb"); tsdf_CPU->write(fp); fclose(fp); delete tsdf_CPU; if (tsdf_GPU!=NULL) checkCUDA(__LINE__, cudaFree(tsdf_GPU)); if (weight_GPU!=NULL) checkCUDA(__LINE__, cudaFree(weight_GPU)); if (depthMaps_GPU!=NULL) checkCUDA(__LINE__, cudaFree(depthMaps_GPU)); if (cameraRtW2C_GPU!=NULL) checkCUDA(__LINE__, cudaFree(cameraRtW2C_GPU)); if (intrinsics_GPU!=NULL) checkCUDA(__LINE__, cudaFree(intrinsics_GPU)); return 0; }