I have a problem where I need to calculate cosine similarities between a NumPy array of shape (1, 300) and a matrix of shape (5000000, 300). I have tried multiple flavors of code, and now I am wondering if there is a way to reduce the runtime substantially:
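For context, here is a minimal synthetic setup matching those shapes, so the snippets below have something to run against (random float32 data as a stand-in for the real matrix; `vec` and the `mat_small*` chunks are the names used in the code below):

import numpy as np

# Synthetic stand-ins: one (1, 300) query vector and a 5M x 300 matrix
# split into five 1M-row chunks (float32 assumed, to keep memory manageable)
vec = np.random.rand(1, 300).astype(np.float32)
big_matrix = np.random.rand(5_000_000, 300).astype(np.float32)
mat_small1, mat_small2, mat_small3, mat_small4, mat_small5 = np.array_split(big_matrix, 5)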
Version 1: I divide my big matrix into 5 smaller matrices of 1 million rows each:
from scipy import spatial
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def cos_matrix_multiplication(vector, matrix_1):
    # cdist returns cosine *distances* (1 - similarity), one per row of matrix_1.
    # Note: the original returned scores1[:1], which silently kept only the
    # score for the first row of the chunk.
    v = vector.reshape(1, -1)
    scores1 = spatial.distance.cdist(matrix_1, v, 'cosine')
    return scores1

URLS = [mat_small1, mat_small2, mat_small3, mat_small4, mat_small5]
neighbors = []
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
    # Submit one chunk per future and collect results as they complete
    future_to_url = {executor.submit(cos_matrix_multiplication, vec, mat_col): mat_col
                     for mat_col in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        neighbors.append(future.result())
Runtime: 2.48 secs
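For comparison: since the query is a single vector, the whole computation collapses to one matrix-vector product divided by the row norms. A plain NumPy sketch, assuming the matrix fits in RAM (this returns similarities, whereas cdist above returns distances):

import numpy as np

def cosine_sim_numpy(vector, matrix):
    # Cosine similarity of every row of `matrix` against one query vector:
    # row-wise dot products divided by the products of the L2 norms
    v = vector.ravel()
    dots = matrix @ v                        # one BLAS call for all rows
    norms = np.linalg.norm(matrix, axis=1)   # row norms; cache these if the matrix is reused
    return dots / (norms * np.linalg.norm(v))

scores = cosine_sim_numpy(vec, mat_small1)

If many queries hit the same matrix, normalizing the rows once up front reduces every later query to a single `matrix @ v`.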
Version 2: Using the Numba JIT, inspired by this SO answer:
import numba
import numpy as np

# Lazy (signature-free) nopython compilation; the original 'void(f4, f4)'
# signature declared two float scalars and no return value, which does not
# match a function that takes two arrays and returns an array.
@numba.jit(nopython=True, nogil=True)
def cosine_sim(A, B):
    # A is the (n, 300) chunk, B the (1, 300) query vector
    scores = np.zeros(A.shape[0])
    for i in range(A.shape[0]):
        v = A[i]
        m = B.shape[1]
        udotv = 0.0
        u_norm = 0.0
        v_norm = 0.0
        for j in range(m):
            udotv += B[0][j] * v[j]
            u_norm += B[0][j] * B[0][j]
            v_norm += v[j] * v[j]
        scores[i] = udotv / ((u_norm * v_norm) ** 0.5)
    return scores

cosine_sim(mat_small1, vec)
Runtime: 2.34 secs
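A variant of the same loop that asks Numba to spread the rows across cores with `prange` may help here; this is a sketch under the same shapes, not a measured result:

import numba
import numpy as np

@numba.njit(parallel=True, fastmath=True)
def cosine_sim_parallel(A, B):
    # One parallel iteration per row of A; B is the (1, 300) query
    scores = np.zeros(A.shape[0])
    for i in numba.prange(A.shape[0]):
        udotv = 0.0
        u_norm = 0.0
        v_norm = 0.0
        for j in range(B.shape[1]):
            udotv += B[0, j] * A[i, j]
            u_norm += B[0, j] * B[0, j]
            v_norm += A[i, j] * A[i, j]
        scores[i] = udotv / ((u_norm * v_norm) ** 0.5)
    return scores

scores = cosine_sim_parallel(mat_small1, vec)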
Version 3: Using the CUDA JIT (I can't really get it to work reproducibly each time):
import math
from numba import cuda

@cuda.jit
def cosine_sim(A, B, C):
    # NOTE: every thread executes this full loop over all rows of A,
    # so the work is duplicated across the grid instead of split
    for i in range(A.shape[0]):
        v = A[i]
        m = B.shape[1]
        udotv = 0
        u_norm = 0
        v_norm = 0
        for j in range(m):
            udotv += B[0][j] * v[j]
            u_norm += B[0][j] * B[0][j]
            v_norm += v[j] * v[j]
        u_norm = math.sqrt(u_norm)
        v_norm = math.sqrt(v_norm)
        if (u_norm == 0) or (v_norm == 0):
            ratio = 1.0
        else:
            ratio = udotv / (u_norm * v_norm)
        C[i, 1] = ratio  # C has shape (n, 1), so column index 1 is out of bounds

matrix = mat_small1
A_global_mem = cuda.to_device(matrix)
B_global_mem = cuda.to_device(vec)
C_global_mem = cuda.device_array((matrix.shape[0], 1))
threadsperblock = (16, 16)
blockspergrid_x = int(math.ceil(A_global_mem.shape[0] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(B_global_mem.shape[1] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)
cosine_sim[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
C = C_global_mem.copy_to_host()
results in:
CudaAPIError: [702] Call to cuMemcpyDtoH results in CUDA_ERROR_LAUNCH_TIMEOUT
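Presumably the timeout comes from the kernel's outer loop: every GPU thread walks all rows of A, so a single launch runs long enough to trip the display watchdog. The usual pattern is one thread per row; a sketch of that restructuring (untested here, and note it writes to column 0 of C):

import math
from numba import cuda

@cuda.jit
def cosine_sim_kernel(A, B, C):
    # Each thread handles exactly one row of A
    i = cuda.grid(1)
    if i < A.shape[0]:
        udotv = 0.0
        u_norm = 0.0
        v_norm = 0.0
        for j in range(B.shape[1]):
            udotv += B[0, j] * A[i, j]
            u_norm += B[0, j] * B[0, j]
            v_norm += A[i, j] * A[i, j]
        u_norm = math.sqrt(u_norm)
        v_norm = math.sqrt(v_norm)
        if u_norm == 0.0 or v_norm == 0.0:
            C[i, 0] = 1.0
        else:
            C[i, 0] = udotv / (u_norm * v_norm)

A_dev = cuda.to_device(mat_small1)
B_dev = cuda.to_device(vec)
C_dev = cuda.device_array((mat_small1.shape[0], 1))
threadsperblock = 256
blockspergrid = math.ceil(mat_small1.shape[0] / threadsperblock)
cosine_sim_kernel[blockspergrid, threadsperblock](A_dev, B_dev, C_dev)
C = C_dev.copy_to_host()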
The matrix is dense, my GPU has 8 GB of RAM, and the total size of the matrix is about 4.7 GB. Can a GPU expedite this at all?
Please try replacing ThreadPoolExecutor with ProcessPoolExecutor (you already have it imported). The former is meant for asynchronous (I/O-bound) calls, not CPU-bound tasks, although the docs do not state this directly.
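A minimal sketch of that swap, reusing the function and chunks from Version 1 (the `if __name__ == '__main__'` guard matters on platforms that spawn workers rather than fork):

import concurrent.futures
from concurrent.futures import ProcessPoolExecutor

if __name__ == '__main__':
    neighbors = []
    # Separate processes sidestep the GIL, so the CPU-bound chunks run in parallel
    with ProcessPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(cos_matrix_multiplication, vec, chunk)
                   for chunk in [mat_small1, mat_small2, mat_small3, mat_small4, mat_small5]]
        for future in concurrent.futures.as_completed(futures):
            neighbors.append(future.result())

Keep in mind that each 1M-row chunk is pickled over to its worker process, so the transfer cost can offset the gain; shared memory or memory-mapped arrays are the usual workarounds.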