// Source: https://pastein.ru/t/kD
//
// (Paste-site banner, translated: "copy the unique link to share")


#include <cstdlib>
#include <ostream>
#include <iostream>
#include <iomanip>
#include <chrono>
#include <stdio.h>
#include <time.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include "device_launch_parameters.h"

using namespace std;

// CPU reference implementation of the column regrouping.
//
// For every row, the element at column c is copied to column
// c / 4 + (c % 4) * matrix_width / 4 of the same row. When matrix_width is a
// multiple of 4 this places all columns with c % 4 == 0 first, then those with
// c % 4 == 1, and so on.
//
// first_matrix  - input, row-major, matrix_height * matrix_width ints.
// second_matrix - output, same layout and size as the input.
// Returns the elapsed time of the copy loop in milliseconds, measured with
// std::chrono::steady_clock.
double transform_matrix_cpu(
	int* first_matrix,
	int* second_matrix,
	const int matrix_height,
	const int matrix_width) {

	const auto started = std::chrono::steady_clock::now();

	for (int row = 0; row < matrix_height; ++row) {
		for (int col = 0; col < matrix_width; ++col) {
			// Source and destination live in the same row; only the column moves.
			const int src_index = row * matrix_width + col;
			const int dst_index = row * matrix_width + col / 4 + (col % 4) * matrix_width / 4;
			second_matrix[dst_index] = first_matrix[src_index];
		}
	}

	const auto finished = std::chrono::steady_clock::now();
	const std::chrono::duration<double, std::milli> elapsed = finished - started;
	return elapsed.count();

}

// Device counterpart of the column regrouping: one thread per matrix element.
//
// The thread's global x index selects the column and its global y index
// selects the row, so the launch grid must cover matrix_width along x and
// matrix_height along y. Threads that fall outside the matrix do nothing.
// The element at column c is written to column
// c / 4 + (c % 4) * matrix_width / 4 of the same row.
//
// first_matrix  - input, row-major, matrix_height * matrix_width ints.
// second_matrix - output, same layout and size as the input.
__global__ void kernelGpu(
	int* first_matrix,
	int* second_matrix,
	const int matrix_height,
	const int matrix_width) {

	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	if (col < matrix_width && row < matrix_height) {
		const int row_start = row * matrix_width;
		const int dst_col = col / 4 + (col % 4) * matrix_width / 4;
		second_matrix[row_start + dst_col] = first_matrix[row_start + col];
	}

}

// Prints a diagnostic for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_call_ok(const cudaError_t status, const char* const what) {
	if (status == cudaSuccess)
		return true;
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Runs the column regrouping on the GPU via kernelGpu.
//
// Copies first_matrix to the device, launches kernelGpu with 32x32 blocks,
// and copies the result back into second_matrix.
//
// first_matrix  - host input, row-major, matrix_height * matrix_width ints.
// second_matrix - host output, same layout and size as the input.
// Returns the kernel execution time in milliseconds as measured by CUDA
// events (host<->device copies are not included). Returns 0 for an empty
// matrix and a negative value if any CUDA call failed (a diagnostic is
// written to stderr in that case).
float transform_matrix_gpu(
	int* first_matrix,
	int* second_matrix,
	const int matrix_height,
	const int matrix_width) {

	// An empty matrix needs no work, and a zero-sized grid is not a valid launch.
	if (matrix_height <= 0 || matrix_width <= 0)
		return 0.0f;

	// Widen before multiplying so large matrices do not overflow int.
	const size_t bytes =
		static_cast<size_t>(matrix_height) * static_cast<size_t>(matrix_width) * sizeof(int);

	// kernelGpu maps x to columns and y to rows, so the grid must cover
	// matrix_width along x and matrix_height along y (rounded up).
	const dim3 block(32, 32);
	const dim3 grid(
		(static_cast<unsigned int>(matrix_width) + block.x - 1) / block.x,
		(static_cast<unsigned int>(matrix_height) + block.y - 1) / block.y);

	int* gpu_first_matrix = nullptr;
	int* gpu_second_matrix = nullptr;
	cudaEvent_t startTime = nullptr;
	cudaEvent_t stopTime = nullptr;
	float result_time = -1.0f;

	// Short-circuit evaluation stops at the first failing call.
	bool ok =
		cuda_call_ok(cudaMalloc((void**)&gpu_first_matrix, bytes), "cudaMalloc (input)")
		&& cuda_call_ok(cudaMalloc((void**)&gpu_second_matrix, bytes), "cudaMalloc (output)")
		&& cuda_call_ok(cudaMemcpy(gpu_first_matrix, first_matrix, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)")
		&& cuda_call_ok(cudaEventCreate(&startTime), "cudaEventCreate (start)")
		&& cuda_call_ok(cudaEventCreate(&stopTime), "cudaEventCreate (stop)")
		&& cuda_call_ok(cudaEventRecord(startTime), "cudaEventRecord (start)");

	if (ok) {
		kernelGpu<<<grid, block>>>(
			gpu_first_matrix,
			gpu_second_matrix,
			matrix_height,
			matrix_width);

		// Launch-configuration errors are only visible through cudaGetLastError;
		// execution errors surface at the synchronizing calls that follow.
		ok = cuda_call_ok(cudaGetLastError(), "kernelGpu launch")
			&& cuda_call_ok(cudaEventRecord(stopTime), "cudaEventRecord (stop)")
			&& cuda_call_ok(cudaEventSynchronize(stopTime), "cudaEventSynchronize")
			&& cuda_call_ok(cudaEventElapsedTime(&result_time, startTime, stopTime), "cudaEventElapsedTime")
			&& cuda_call_ok(cudaMemcpy(second_matrix, gpu_second_matrix, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
	}

	// Release everything that was successfully created, on success and failure alike.
	if (startTime != nullptr)
		cudaEventDestroy(startTime);
	if (stopTime != nullptr)
		cudaEventDestroy(stopTime);
	cudaFree(gpu_first_matrix);   // cudaFree on a null pointer is a no-op
	cudaFree(gpu_second_matrix);

	return ok ? result_time : -1.0f;
}

// Element-wise equality check of two row-major height x width int matrices.
// Returns false at the first differing element, true if none differ
// (including when height or width is not positive, since nothing is compared).
bool compare_matrix(int* first, int* second, int height, int width) {
	for (int row = 0; row < height; ++row) {
		for (int col = 0; col < width; ++col) {
			const int cell = row * width + col;
			const bool same = (first[cell] == second[cell]);
			if (!same)
				return false;
		}
	}
	return true;
}

// Allocates a zero-filled, row-major height x width matrix of ints on the host.
//
// Returns a pointer obtained from calloc (release it with free), or nullptr
// if either dimension is negative or the allocation fails.
int* initialize_matrix(const int height, const int width) {
	// Two negative dimensions would otherwise multiply into a bogus positive count.
	if (height < 0 || width < 0)
		return nullptr;

	// Widen before multiplying so the element count cannot overflow int;
	// calloc itself rejects a count * size product that does not fit in size_t.
	const size_t element_count = static_cast<size_t>(height) * static_cast<size_t>(width);
	return static_cast<int*>(calloc(element_count, sizeof(int)));
}

// Fills a row-major height x width matrix with pseudo-random values in [1, 100].
//
// Values come from rand(), one call per element in row-major order. This
// function does not seed the generator, so the sequence depends on whatever
// seed is current when it is called.
void fill_random_matrix(int* matrix, int height, int width) {
	for (int row = 0; row < height; ++row) {
		for (int col = 0; col < width; ++col) {
			matrix[row * width + col] = rand() % 100 + 1;
		}
	}
}

// Prints the top-left corner of a row-major height x width matrix to stdout.
//
// At most 16 rows and 16 columns are shown. Each value is right-aligned in a
// field of width 4, and every printed row is terminated with std::endl.
void show_matrix(int* matrix, const int height, const int width) {
	int shown_rows = height;
	if (shown_rows > 16)
		shown_rows = 16;

	int shown_cols = width;
	if (shown_cols > 16)
		shown_cols = 16;

	for (int row = 0; row < shown_rows; ++row) {
		// Index with the full width so truncated rows still start at the right element.
		const int* const row_data = matrix + row * width;
		for (int col = 0; col < shown_cols; ++col) {
			std::cout << std::setw(4) << row_data[col];
		}
		std::cout << std::endl;
	}
}

// Interactive driver: reads the matrix dimensions, runs the CPU and GPU
// column regrouping on the same random input, prints (a corner of) the input
// and both results, the two timings, and whether the results match.
//
// Returns 0 on a completed run and 1 if the input is invalid or host
// allocation fails.
int main() {
	int matrix_height = 0;
	int matrix_width = 0;
	std::cout << "Matrix height: ";
	std::cin >> matrix_height;
	std::cout << "Matrix width: ";
	std::cin >> matrix_width;

	// Reject unreadable input and sizes that would lead to empty or
	// nonsensical allocations and kernel launches.
	if (!std::cin || matrix_height <= 0 || matrix_width <= 0) {
		std::cerr << "Matrix dimensions must be positive integers." << std::endl;
		return 1;
	}

	// Pad the width up to the next multiple of 4 so the regrouping maps every
	// column to a distinct destination column.
	const int remainder = matrix_width % 4;
	if (remainder != 0)
		matrix_width += 4 - remainder;

	int* const first_matrix = initialize_matrix(matrix_height, matrix_width);
	int* const second_matrix = initialize_matrix(matrix_height, matrix_width);
	int* const third_matrix = initialize_matrix(matrix_height, matrix_width);
	if (first_matrix == nullptr || second_matrix == nullptr || third_matrix == nullptr) {
		std::cerr << "Failed to allocate host matrices." << std::endl;
		free(first_matrix);   // free(nullptr) is a no-op
		free(second_matrix);
		free(third_matrix);
		return 1;
	}

	fill_random_matrix(first_matrix, matrix_height, matrix_width);

	const auto cpu_time = transform_matrix_cpu(
		first_matrix,
		second_matrix,
		matrix_height,
		matrix_width);

	const auto gpu_time = transform_matrix_gpu(
		first_matrix,
		third_matrix,
		matrix_height,
		matrix_width);

	show_matrix(first_matrix, matrix_height, matrix_width);
	std::cout << std::endl;
	show_matrix(second_matrix, matrix_height, matrix_width);
	std::cout << std::endl;
	show_matrix(third_matrix, matrix_height, matrix_width);
	std::cout << std::endl;
	std::cout << "CPU Time: " << cpu_time << " ms." << std::endl;
	std::cout << "GPU Time: " << gpu_time << " ms." << std::endl;

	const bool results_match = compare_matrix(second_matrix, third_matrix, matrix_height, matrix_width);
	std::cout << "Compare CPU and GPU - " << (results_match ? "true" : "false") << std::endl;

	free(first_matrix);
	free(second_matrix);
	free(third_matrix);

#ifdef _WIN32
	// Keeps the console window open when launched from an IDE; the "pause"
	// command only exists in the Windows shell.
	system("pause");
#endif
	return 0;
}