add warp_perspective kernel

dujw · dujw · commit f24e4921e23e · 2022-04-27T06:40:16.000Z
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -74,7 +74,9 @@
         "cinttypes": "cpp",
         "typeindex": "cpp",
         "valarray": "cpp",
-        "bit": "cpp"
+        "bit": "cpp",
+        "__functional_base": "cpp",
+        "locale": "cpp"
     },
     "workbench.tree.expandMode": "doubleClick"
 }
diff --git a/src/tensorRT/common/preprocess_kernel.cu b/src/tensorRT/common/preprocess_kernel.cu
@@ -105,6 +105,99 @@ namespace CUDAKernel{
 		*pdst_c2 = c2;
 	}
 
+	// Perspective-warps a packed 3-channel uint8 image into three float planes, one thread per dst pixel.
+	// warp_affine_matrix_3_3: 9 floats, row-major 3x3, mapping dst (dx, dy) to src coordinates; it is read here,
+	// so it must be readable from device code. src rows are src_line_size bytes apart. Samples outside the source
+	// use const_value_st. Threads with position >= edge exit without writing.
+	__global__ void warp_perspective_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
+		uint8_t const_value_st, float* warp_affine_matrix_3_3, Norm norm, int edge){
+		int position = blockDim.x * blockIdx.x + threadIdx.x;
+		if (position >= edge) return;
+
+		int dx = position % dst_width;
+		int dy = position / dst_width;
+
+		// Homogeneous transform of the dst pixel: (num_x, num_y, denom) = M * (dx, dy, 1).
+		const float* m = warp_affine_matrix_3_3;
+		float num_x = m[0] * dx + m[1] * dy + m[2];
+		float num_y = m[3] * dx + m[4] * dy + m[5];
+		float denom = m[6] * dx + m[7] * dy + m[8];
+
+		// Position in the source image (perspective divide).
+		float src_x = num_x / denom;
+		float src_y = num_y / denom;
+		float c0, c1, c2;
+
+		// Written as a negated "inside" test on purpose: every comparison with NaN is false, so NaN coordinates
+		// (0/0 from a degenerate matrix) get the border value like any other out-of-range sample.
+		bool inside = src_x > -1 && src_x < src_width && src_y > -1 && src_y < src_height;
+		if(!inside){
+			c0 = c1 = c2 = const_value_st;
+		}else{
+			int y_low  = floorf(src_y);
+			int x_low  = floorf(src_x);
+			int y_high = y_low + 1;
+			int x_high = x_low + 1;
+
+			// Bilinear weights of the four neighbouring source pixels.
+			float ly = src_y - y_low;
+			float lx = src_x - x_low;
+			float hy = 1 - ly;
+			float hx = 1 - lx;
+			float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+			// Neighbours that fall outside the image sample the constant border colour instead.
+			uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
+			const uint8_t* v1 = const_value;
+			const uint8_t* v2 = const_value;
+			const uint8_t* v3 = const_value;
+			const uint8_t* v4 = const_value;
+			if(y_low >= 0){
+				if (x_low >= 0)
+					v1 = src + y_low * src_line_size + x_low * 3;
+				if (x_high < src_width)
+					v2 = src + y_low * src_line_size + x_high * 3;
+			}
+
+			if(y_high < src_height){
+				if (x_low >= 0)
+					v3 = src + y_high * src_line_size + x_low * 3;
+				if (x_high < src_width)
+					v4 = src + y_high * src_line_size + x_high * 3;
+			}
+
+			// Round to nearest (same as OpenCV).
+			c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
+			c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
+			c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
+		}
+
+		// Optionally swap the first and third channels.
+		if(norm.channel_type == ChannelType::Invert){
+			float t = c2;
+			c2 = c0;  c0 = t;
+		}
+
+		if(norm.type == NormType::MeanStd){
+			c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
+			c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
+			c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
+		}else if(norm.type == NormType::AlphaBeta){
+			c0 = c0 * norm.alpha + norm.beta;
+			c1 = c1 * norm.alpha + norm.beta;
+			c2 = c2 * norm.alpha + norm.beta;
+		}
+
+		// Planar output: the three channel planes are `area` floats apart.
+		int area = dst_width * dst_height;
+		float* pdst_c0 = dst + dy * dst_width + dx;
+		float* pdst_c1 = pdst_c0 + area;
+		float* pdst_c2 = pdst_c1 + area;
+		*pdst_c0 = c0;
+		*pdst_c1 = c1;
+		*pdst_c2 = c2;
+	}
+
 	__global__ void warp_affine_bilinear_and_normalize_plane_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, 
 		uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge){
 
@@ -394,6 +487,22 @@ namespace CUDAKernel{
 		));
 	}
 
+	// Launches warp_perspective_kernel on `stream` with dst_width * dst_height jobs (one per destination pixel).
+	// All pointers are handed to the kernel unchanged, so they must be valid for device-side access.
+	void warp_perspective(
+		uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
+		float* matrix_3_3, uint8_t const_value, const Norm& norm, cudaStream_t stream
+	){
+		// Empty or negative destination: nothing to write, and a non-positive job count must not reach the launch.
+		if (dst_width <= 0 || dst_height <= 0) return;
+		int jobs   = dst_width * dst_height;
+		auto grid  = CUDATools::grid_dims(jobs);
+		auto block = CUDATools::block_dims(jobs);
+		checkCudaKernel(warp_perspective_kernel<<<grid, block, 0, stream>>>(
+			src, src_line_size, src_width, src_height, dst, dst_width, dst_height, const_value, matrix_3_3, norm, jobs
+		));
+	}
+
 	void resize_bilinear_and_normalize(
 		uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
 		const Norm& norm,
diff --git a/src/tensorRT/common/preprocess_kernel.cuh b/src/tensorRT/common/preprocess_kernel.cuh
@@ -50,6 +50,36 @@ namespace CUDAKernel{
         float* matrix_2_3, uint8_t const_value, const Norm& norm,
         cudaStream_t stream);
 
+    // Perspective warp; usable for image rectification, rotation, etc. (author's tests: 10x+ faster than CPU)
+    // Usage example:
+    // float* matrix_3_3 = nullptr;
+    // size_t matrix_bytes = 3 * 3 * sizeof(float);
+    // checkCudaRuntime(cudaMalloc(&matrix_3_3, matrix_bytes));
+    // checkCudaRuntime(cudaMemset(matrix_3_3, 0,  matrix_bytes));
+    //
+    // # Four corner points in the source image: top-left, top-right, bottom-right, bottom-left
+    //    cv::Point2f src_points[] = { 
+    //    vctvctPoints[nImageIdx][0],
+    //    vctvctPoints[nImageIdx][1],
+    //    vctvctPoints[nImageIdx][2],
+    //    vctvctPoints[nImageIdx][3]};
+    // NOTE(review): the src and dst orderings described here differ; points are paired by index - confirm.
+    // # Four corner points in the destination image: top-left, top-right, bottom-left, bottom-right (Z-order)
+    //    cv::Point2f dst_points[] = {
+    //        cv::Point2f(0, 0),
+    //        cv::Point2f(nw-1, 0),
+    //        cv::Point2f(0, nh-1),
+    //        cv::Point2f(nw-1, nh-1) };
+    // Use OpenCV to compute the transform matrix that maps dst -> src
+    //    cv::Mat Perspect_Matrix = cv::getPerspectiveTransform(dst_points, src_points);
+    //    Perspect_Matrix.convertTo(Perspect_Matrix,  CV_32FC1);
+    // Copy the matrix to the GPU
+    //    checkCudaRuntime(cudaMemcpy(matrix_3_3, Perspect_Matrix.data, matrix_bytes, cudaMemcpyHostToDevice));
+    void warp_perspective(
+        uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
+		float* matrix_3_3, uint8_t const_value, const Norm& norm, cudaStream_t stream
+    );
+
     void norm_feature(
         float* feature_array, int num_feature, int feature_length,
         cudaStream_t stream