!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Feb__8_05:53:42_Coordinated_Universal_Time_2023
Cuda compilation tools, release 12.1, V12.1.66
Build cuda_12.1.r12.1/compiler.32415258_0

!nvidia-smi

Tue Nov 12 13:12:43 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.99                 Driver Version: 555.99         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 2070      WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   49C    P8             15W /  115W |     595MiB /   8192MiB |     14%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A      8260    C+G   ...CBS_cw5n1h2txyewy\TextInputHost.exe      N/A      |
|    0   N/A  N/A      9260    C+G   ...\OpenVPN Connect\OpenVPNConnect.exe      N/A      |
|    0   N/A  N/A      9400    C+G   ...cent\QQGuild\9.7.22-513\QQGuild.exe      N/A      |
|    0   N/A  N/A     12280    C+G   C:\Windows\explorer.exe                     N/A      |
|    0   N/A  N/A     12448    C+G   ...nt.CBS_cw5n1h2txyewy\SearchHost.exe      N/A      |
|    0   N/A  N/A     12460    C+G   ...2txyewy\StartMenuExperienceHost.exe      N/A      |
|    0   N/A  N/A     15936    C+G   ...ASUSACCI\ArmouryCrateKeyControl.exe      N/A      |
|    0   N/A  N/A     17880    C+G   ...am Files\Microsoft VS Code\Code.exe      N/A      |
+-----------------------------------------------------------------------------------------+

@cuda.jit
    def add_kernel(x, y, out):

from numba import cuda
# Fetch the CUDA device associated with the current context and show its name.
# The bare `device.name` expression relies on notebook echo of the last
# expression in a cell; in a plain script it would print nothing.
device = cuda.get_current_device()
device.name

b'NVIDIA GeForce RTX 2070'

import random
import time

def add_vectors(vector1, vector2, result_cpu):
    """Add two sequences element by element, storing the sums in place.

    Args:
        vector1: First operand; its length decides how many sums are computed.
        vector2: Second operand, read at the same positions as ``vector1``.
        result_cpu: Pre-allocated, index-assignable output; position ``k``
            receives ``vector1[k] + vector2[k]``.

    Returns:
        None. ``result_cpu`` is modified in place.
    """
    for position, left in enumerate(vector1):
        result_cpu[position] = left + vector2[position]


# Benchmark the pure-Python element-wise addition on two lists of n floats.
# n, vector_a, vector_b and result_cpu are module-level names that the GPU
# benchmark further down reuses.
n = 10**8
vector_a = [random.uniform(1.0, 100) for _ in range(n)]
vector_b = [random.uniform(1.0, 100) for _ in range(n)]
result_cpu = [0.0 for _ in range(n)]  # pre-allocated output, filled in place
# Only the add_vectors call sits between the two timestamps; building the
# input lists is not part of the measurement.
start_time = time.time()
add_vectors(vector_a, vector_b, result_cpu)
end_time = time.time()
print("Time taken for CPU computation: ", end_time - start_time)

Time taken for CPU computation:  6.654557228088379

from numba import cuda
import math

@cuda.jit
def vector_add_gpu(vector1, vector2, result_gpu):
    """CUDA kernel: each thread writes one element-wise sum into ``result_gpu``.

    Args:
        vector1: First input array; its length bounds the valid thread indices.
        vector2: Second input array, read at the same index as ``vector1``.
        result_gpu: Output array receiving ``vector1[i] + vector2[i]``.
    """
    # Globally unique index of this thread within the 1-D launch grid.
    tid = cuda.grid(1)
    # Threads whose index falls past the end of the data have nothing to do.
    if tid >= len(vector1):
        return
    result_gpu[tid] = vector1[tid] + vector2[tid]

# Copy the two inputs and the output buffer into GPU memory.
vector1_gpu = cuda.to_device(vector_a)
vector2_gpu = cuda.to_device(vector_b)
result_gpu = cuda.to_device(result_cpu)

# Choose the CUDA grid and block sizes: one thread per element, with the block
# count rounded up so the final, partially filled block is still launched.
threads_per_block = 256
blocks_per_grid = math.ceil(n/threads_per_block)

# Warm-up launch. Numba compiles a @cuda.jit kernel lazily on its first call
# for a given argument signature, so without this the measurement below would
# mostly report JIT compilation time rather than execution time. The kernel is
# idempotent (out[i] = a[i] + b[i]), so running it twice is harmless.
vector_add_gpu[blocks_per_grid, threads_per_block](vector1_gpu, vector2_gpu, result_gpu)
cuda.synchronize()

# Time the GPU version.
start_time = time.time()
vector_add_gpu[blocks_per_grid, threads_per_block](vector1_gpu, vector2_gpu, result_gpu)
# Kernel launches are asynchronous: wait until all GPU work has finished
# before reading the clock.
cuda.synchronize()
gpu_time = time.time() - start_time
result_cpu = result_gpu.copy_to_host()
print("Time taken for GPU computation:", gpu_time)
# Printing a device array only shows its object repr, so copy the slice back
# to the host to display the actual values.
print(result_gpu[0:10].copy_to_host())
print(result_cpu[0:10])

Time taken for GPU computation: 4.254750728607178
<numba.cuda.cudadrv.devicearray.DeviceNDArray object at 0x000002179B3AF010>
[153.76211748 121.02976443 144.9690432   69.96996359 120.96739116
 153.96127433 149.94592761 166.40153127 111.64821499  80.59941164]

# pip install opencv-python
import cv2
import numba
import time
import math

filename = 'pj_at_apple.jpg'
img = cv2.imread(filename)
# cv2.imread reports failure (missing file, wrong path, unsupported format)
# by returning None instead of raising, which would otherwise surface as an
# opaque AttributeError on `.shape` below.
if img is None:
    raise FileNotFoundError(f"Could not read image file: {filename}")
# A colour image loaded this way is indexed as [row, column, channel].
rows, cols, channels = img.shape
print(rows, cols, channels)

3468 4624 3

# CPU reference implementation
def process_cpu(img):
    """Brighten an image in place: ``value * 2.0 + 30`` clamped to [0, 255].

    Args:
        img: 3-D array indexed as ``[row, column, channel]``. Every element is
            overwritten with its transformed value.

    Returns:
        None. ``img`` is modified in place.
    """
    height, width, depth = img.shape
    for r in range(height):
        for col in range(width):
            for ch in range(depth):
                scaled = img[r, col, ch] * 2.0 + 30
                # Clamp to the displayable range before storing.
                if scaled > 255:
                    scaled = 255
                elif scaled < 0:
                    scaled = 0
                img[r, col, ch] = scaled

# process_cpu modifies its argument in place, so run it on a copy and keep
# the loaded image unchanged.
dst_cpu = img.copy()
# Only the process_cpu call sits between the two timestamps.
start_cpu = time.time()
process_cpu(dst_cpu)
end_cpu = time.time()
time_cpu = (end_cpu-start_cpu)
print("CPU process time: "+str(time_cpu))

CPU process time: 92.6022481918335

# GPU kernel
@cuda.jit
def process_gpu(img,rows,cols,channels):
    """CUDA kernel: brighten one pixel per thread, in place.

    Each thread applies ``value * 2.0 + 30`` clamped to [0, 255] to every
    channel of the pixel selected by its 2-D grid position.

    Args:
        img: Image array indexed as ``[row, column, channel]``; modified in place.
        rows: Number of image rows; bounds the first grid coordinate.
        cols: Number of image columns; bounds the second grid coordinate.
        channels: Number of channels processed per pixel.
    """
    r, col = cuda.grid(2)
    # Threads positioned outside the image do nothing.
    if r >= rows or col >= cols:
        return
    for ch in range(channels):
        scaled = img[r, col, ch] * 2.0 + 30
        if scaled > 255:
            img[r, col, ch] = 255
        elif scaled < 0:
            img[r, col, ch] = 0
        else:
            img[r, col, ch] = scaled

# NOTE(review): this copy is only used to read the shape; the array uploaded
# below is `img`, and dst_gpu is later rebound to the result copied back from
# the device.
dst_gpu = img.copy()
rows, cols, channels = dst_gpu.shape

# Launch configuration: 16x16 threads per block, with enough blocks along each
# axis (rounded up) to cover every row and column of the image.
threadsperblock = (16,16)
blockspergrid_x = math.ceil(rows/threadsperblock[0])
blockspergrid_y = math.ceil(cols/threadsperblock[1])
blockspergrid = (blockspergrid_x,blockspergrid_y)
# The timed region covers the host-to-device upload, the kernel launch and the
# synchronize; the copy back to the host happens after end_gpu is taken.
start_gpu = time.time()
dImg = cuda.to_device(img)
process_gpu[blockspergrid,threadsperblock](dImg,rows,cols,channels)
cuda.synchronize()
end_gpu = time.time()
dst_gpu = dImg.copy_to_host()
time_gpu = (end_gpu-start_gpu)
print("GPU process time: "+str(time_gpu))

GPU process time: 0.22005558013916016

import numpy as np
def matmul_cpu(A,B,C):
    """Compute the matrix product ``A @ B`` with explicit loops, writing into ``C``.

    Args:
        A: Left operand, indexed as ``A[row, k]``.
        B: Right operand, indexed as ``B[k, col]``.
        C: Pre-allocated output indexed as ``C[row, col]``; every entry in the
            ``A.shape[0]`` x ``B.shape[1]`` range is overwritten.

    Returns:
        None. ``C`` is modified in place.
    """
    out_rows = A.shape[0]
    inner = A.shape[1]
    out_cols = B.shape[1]
    for col in range(out_cols):
        for row in range(out_rows):
            # Dot product of row `row` of A with column `col` of B.
            acc = 0
            for idx in range(inner):
                acc += A[row, idx] * B[idx, col]
            C[row, col] = acc

# TPB is the block edge length used by the GPU launch; the matrices are sized
# as a multiple of it (160 x 160).
TPB = 16 # Threads per block
A = np.full((TPB*10,TPB*10), 3.0)
B = np.full((TPB*10,TPB*10), 4.0)
# Output buffer with A's row count and B's column count, zero-initialised.
C_cpu = np.full((A.shape[0],B.shape[1]), 0.0)

# Time the CPU implementation; only the matmul_cpu call is measured.
print("Start processing in CPU")
start_cpu = time.time()
matmul_cpu(A,B,C_cpu)
end_cpu = time.time()
time_cpu = (end_cpu - start_cpu)
print("CPU time: "+str(time_cpu))

Start processing in CPU
CPU time: 1.325270175933838

import warnings
# Suppress every Python warning from this point on.
# NOTE(review): presumably meant to hide Numba's performance warnings for the
# small launches below; a blanket 'ignore' also hides unrelated warnings — confirm.
warnings.filterwarnings('ignore')

@cuda.jit
def matmul_gpu(A,B,C):
    """CUDA kernel: each thread computes one entry of ``C = A @ B``.

    Args:
        A: Left operand, indexed as ``A[row, k]``.
        B: Right operand, indexed as ``B[k, col]``.
        C: Output array; its shape bounds the valid thread positions.
    """
    r, c = cuda.grid(2)
    # Threads positioned outside C have no entry to compute.
    if r >= C.shape[0] or c >= C.shape[1]:
        return
    acc = 0.
    for idx in range(A.shape[1]):
        acc += A[r, idx] * B[idx, c]
    C[r, c] = acc

# Copy the operands and the output buffer to the GPU.
A_global_gpu = cuda.to_device(A)
B_global_gpu = cuda.to_device(B)
C_global_gpu = cuda.to_device(C_cpu)

# One thread per entry of C, so the grid must cover C's shape:
# A.shape[0] rows by B.shape[1] columns. The y extent is taken from
# B.shape[1], not from the inner dimension A.shape[1]; otherwise part of C
# would never be computed when the operands are not square.
threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(A.shape[0]/threadsperblock[0]))
blockspergrid_y = int(math.ceil(B.shape[1]/threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Time the launch; synchronize so the asynchronous kernel has finished before
# the end timestamp is taken.
start_gpu = time.time()
matmul_gpu[blockspergrid, threadsperblock](A_global_gpu, B_global_gpu, C_global_gpu)
cuda.synchronize()
end_gpu = time.time()
time_gpu = (end_gpu - start_gpu)
print("GPU time:"+str(time_gpu))

GPU time:0.15148043632507324

import cv2
import numpy as np
import matplotlib.pyplot as plt
import time

def convolve2d(image, kernel):
    """Slide ``kernel`` over ``image`` and sum the element-wise products.

    Only positions where the kernel fits entirely inside the image are
    evaluated, and the kernel is applied as given (it is not flipped).

    Args:
        image: 2-D array to filter.
        kernel: 2-D array of weights.

    Returns:
        A new float array of shape
        ``(image_rows - kernel_rows + 1, image_cols - kernel_cols + 1)``.
    """
    img_h, img_w = image.shape
    k_h, k_w = kernel.shape
    result = np.zeros((img_h - k_h + 1, img_w - k_w + 1))
    # np.ndindex walks the output positions in row-major order.
    for top, left in np.ndindex(*result.shape):
        window = image[top:top + k_h, left:left + k_w]
        result[top, left] = (window * kernel).sum()
    return result

# Load the test image and reduce it to a single grayscale channel, since
# convolve2d expects a 2-D array.
img = cv2.imread('conv.png')
image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# 3x3 kernel with weights +1 in the left column and -1 in the right column.
kernel = np.array([[1, 0, -1],
                   [1, 0, -1],
                   [1, 0, -1]])
# Only the convolve2d call is timed.
start_time = time.time()
output_image = convolve2d(image, kernel)
end_time = time.time()
print("Time taken for convolution: ", end_time - start_time)

Time taken for convolution:  0.9632997512817383

# Show the original and the processed image side by side.
plt.figure(figsize=(10, 5))
# Left panel: grayscale input.
plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(image, cmap='gray')
plt.axis('off')

# Right panel: result of convolve2d.
plt.subplot(1, 2, 2)
plt.title("Processed Image")
plt.imshow(output_image, cmap='gray')
plt.axis('off')

plt.show()

from numba import cuda
@cuda.jit
def convolve_kernel(input_image, output_image, kernel, kernel_size):
    """CUDA kernel: each thread computes one output pixel of a 2-D filter.

    The thread at grid position ``(x, y)`` sums
    ``input_image[x + i, y + j] * kernel[i, j]`` over the
    ``kernel_size`` x ``kernel_size`` window and stores the result in
    ``output_image[x, y]``. The kernel is applied as given (not flipped).

    Args:
        input_image: 2-D source array.
        output_image: 2-D destination array; only positions inside its shape
            are written.
        kernel: 2-D weight array, read over ``[0, kernel_size)`` on both axes.
        kernel_size: Edge length of the square window to apply.
    """
    x, y = cuda.grid(2)
    k = kernel_size
    # Guard on the OUTPUT extent, and on the window fitting inside the input.
    # Checking only the input shape would let threads near the bottom/right
    # edges read input_image[x + i, y + j] and write output_image[x, y] out of
    # bounds, because the output is smaller than the input by k - 1.
    if (x < output_image.shape[0] and y < output_image.shape[1]
            and x + k <= input_image.shape[0]
            and y + k <= input_image.shape[1]):
        conv_sum = 0.0
        for i in range(k):
            for j in range(k):
                conv_sum += input_image[x + i, y + j] * kernel[i, j]
        output_image[x, y] = conv_sum

import math
# Load the test image directly as a single-channel grayscale array.
image_path = 'conv.png'
input_image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

# 3x3 kernel with weights +1 in the left column and -1 in the right column.
kernel = np.array([[1, 0, -1],
                   [1, 0, -1],
                   [1, 0, -1]])

kernel_size = kernel.shape[0] # edge length, passed to the GPU kernel as a scalar

# Upload the image and the filter weights to the device.
input_image_device = cuda.to_device(input_image)
kernel_device = cuda.to_device(kernel)

# The output is smaller than the input by (kernel size - 1) along each axis.
output_image = np.zeros((input_image.shape[0] - kernel.shape[0] + 1, input_image.shape[1] - kernel.shape[1] + 1))
output_image_device = cuda.to_device(output_image)

# Grid and block sizes. The grid is derived from the input image dimensions
# (rounded up), which slightly over-covers the smaller output array.
threads_per_block = (16, 16)
blocks_per_grid_x = math.ceil(input_image.shape[0] / threads_per_block[0])
blocks_per_grid_y = math.ceil(input_image.shape[1] / threads_per_block[1])
blocks_per_grid = (blocks_per_grid_x, blocks_per_grid_y)

# The timed region covers the kernel launch and the synchronize only; the
# uploads above and the copy back to the host are outside it.
start_time = time.time()
convolve_kernel[blocks_per_grid, threads_per_block](input_image_device, output_image_device, kernel_device, kernel_size)
cuda.synchronize()
end_time = time.time()

print("Time taken on GPU:", end_time - start_time, "seconds")

Time taken on GPU: 0.8308184146881104 seconds

# Copy the result from the device back into the existing host array.
output_image_device.copy_to_host(output_image)

# Show the original image and the convolved result side by side.
plt.figure(figsize=(10, 5))
# Left panel: grayscale input.
plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(input_image, cmap='gray')
plt.axis('off')

# Right panel: output computed on the GPU.
plt.subplot(1, 2, 2)
plt.title("Processed Image")
plt.imshow(output_image, cmap='gray')
plt.axis('off')

plt.show()

Python数据处理¶

12. Python CUDA计算¶

什么是CUDA？¶

CPU和GPU对比¶

CPU和GPU的对比¶

CPU和GPU对比¶

适用设备:¶

CUDA相关命令¶

GPU的硬件结构¶

Device¶

SM¶

CUDA的线程层次¶

numba介绍¶

numba介绍¶

CUDA线程索引¶

实际编程¶

思考： numpy实现向量相加的效率？¶

GPU处理图像¶

矩阵相乘¶

卷积操作¶