Getting "LogicError: explicit_context_dependent failed: invalid device context - no currently active context?" when running TensorRT in ROS

I have inference code written with TensorRT (Python API). I want to run this code inside a ROS node, but I get the error below when it tries to allocate buffers:

LogicError: explicit_context_dependent failed: invalid device context - no currently active context?

The code works fine outside of the ROS package. A ROS node publishes an image, and the code below receives the image and runs inference on it:

#!/usr/bin/env python
# Revision $Id$

import rospy
from std_msgs.msg import String
from cv_bridge import CvBridge
import cv2
import os
import numpy as np
import argparse
import torch
from torch.autograd import Variable
from torchvision import transforms
import torch.nn.functional as F
import torch._utils
from PIL import Image
from sensor_msgs.msg import Image as ImageMsg
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import random
import sys
import common
import shutil
from itertools import chain

# Logger instance shared by the TensorRT builder and ONNX parser created in
# build_int8_engine below.
TRT_LOGGER = trt.Logger()
# cuda.init()

class ModelData(object):
    """ROS-side wrapper that builds a TensorRT engine and runs it on received images.

    On construction the ONNX model at ``MODEL_PATH`` is turned into an engine
    via ``build_int8_engine`` and an execution context is created from it.
    ``listener`` subscribes to the ``chatter`` image topic and ``callback``
    is invoked for every message received on it.
    """

    def __init__(self):
        # Model converted from PyTorch to ONNX.
        self.MODEL_PATH = "./MobileNet_v2_Final.onnx"

        self.batch_size = 1
        self.num_classes = 3

        self.engine = build_int8_engine(self.MODEL_PATH, self.batch_size)
        self.context = self.engine.create_execution_context()

        # ROS part
        self.bridge_ROS = CvBridge()
        self.loop_rate = rospy.Rate(1)
        self.pub = rospy.Publisher('Image_Label', String, queue_size=1)

        print('INIT Successfully')

    def callback(self, msg):
        """Handle one incoming image message and run inference.

        Args:
            msg: the ``sensor_msgs/Image`` message delivered by the subscriber.
        """
        rospy.loginfo('Image received...')

        # NOTE(review): cv_image is not yet copied into the input buffer in
        # this snippet; pre-processing appears to have been omitted.
        cv_image = self.bridge_ROS.imgmsg_to_cv2(msg, desired_encoding="passthrough")

        # NOTE(review): the CUDA calls below need a CUDA context that is
        # current in the thread running this callback.
        # Use the instance's execution context and batch size; the bare names
        # `context` and `effective_batch_size` are not defined anywhere.
        inputs, outputs, bindings, stream = common.allocate_buffers(self.context.engine)
        [output] = common.do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=self.batch_size)

    def listener(self):
        """Subscribe to the image topic and keep the node alive until shutdown."""
        rospy.Subscriber("chatter", ImageMsg, self.callback)

        while not rospy.is_shutdown():
            rospy.loginfo('Getting image...')
            # Throttle to the configured rate instead of busy-looping and
            # flooding the log.
            try:
                self.loop_rate.sleep()
            except rospy.ROSInterruptException:
                break

def build_int8_engine(model_file, batch_size=32):
    """Build a TensorRT CUDA engine from an ONNX model file.

    NOTE(review): despite the name, nothing in this function enables INT8
    mode or sets a calibrator - confirm whether that was intended.

    Args:
        model_file: path to the serialized ONNX model.
        batch_size: value assigned to ``builder.max_batch_size``.

    Returns:
        The engine returned by ``builder.build_cuda_engine``.

    Raises:
        RuntimeError: if the ONNX parser fails to parse ``model_file``.
    """
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = batch_size
        builder.max_workspace_size = common.GiB(1)

        # Populate the network definition from the ONNX file; without this
        # the builder would be asked to build an empty network.
        with open(model_file, 'rb') as model:
            parsed = parser.parse(model.read())

        if not parsed:
            errors = [str(parser.get_error(i)) for i in range(parser.num_errors)]
            raise RuntimeError(
                "Failed to parse ONNX model %r: %s" % (model_file, "; ".join(errors))
            )

        return builder.build_cuda_engine(network)

if __name__ == '__main__':
    rospy.init_node("listener", anonymous=True)
    infer = ModelData()
    # Start the subscriber loop; without this call the node never subscribes
    # to the image topic and exits right after initialization.
    infer.listener()

The error is raised in the module below (imported above as `common`), at the line `stream = cuda.Stream()` in `allocate_buffers`:

#!/usr/bin/env python
# Revision $Id$

from itertools import chain
import argparse
import os
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import tensorrt as trt

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    """Pair of a host buffer and its matching device buffer.

    Attributes are exposed as ``host`` and ``device``.
    """

    def __init__(self, host_mem, device_mem):
        self.host, self.device = host_mem, device_mem

    def __str__(self):
        # Same layout as before: a label line followed by the buffer's str().
        pieces = ["Host:", str(self.host), "Device:", str(self.device)]
        return "\n".join(pieces)

    def __repr__(self):
        # repr intentionally mirrors the human-readable form.
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    """Allocate host and device buffers for every binding of ``engine``.

    A CUDA context must be current in the calling thread; otherwise the
    first CUDA call here (``cuda.Stream()``) fails.

    Args:
        engine: the TensorRT engine whose bindings are iterated.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of ``HostDeviceMem``, ``bindings`` is the list
        of device buffer addresses in binding order, and ``stream`` is a
        newly created ``cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer address to the device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list: each binding is either an input
        # or an output, never both.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, batch_size=1, stream=None):
    """Run one asynchronous inference pass and return the host output buffers.

    Args:
        context: execution context on which ``execute_async`` is called.
        bindings: device buffer addresses passed to ``execute_async``.
        inputs: list of ``HostDeviceMem`` objects copied host-to-device.
        outputs: list of ``HostDeviceMem`` objects copied device-to-host.
        batch_size: batch size passed to ``execute_async``.
        stream: CUDA stream on which the copies and the execution are queued.
            A new ``cuda.Stream`` is created when omitted.

    Returns:
        A list with the ``host`` buffer of every entry in ``outputs``.
    """
    if stream is None:
        stream = cuda.Stream()

    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for the queued copies and execution to finish; the host buffers
    # are not valid to read before that.
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]

More info:

TensorRT: 6.1.5
Python: 2.7
rosversion: 1.14.3
rosdistro: melodic

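A CUDA context is bound to the thread in which it is made current. `import pycuda.autoinit` creates the context in the thread that performs the import (the main thread), while your subscriber callback is executed in a different worker thread, where no context is active. You therefore need to explicitly create the CUDA device and make a CUDA context current in the worker thread, i.e. inside your callback function, instead of relying on `import pycuda.autoinit` in the main thread, as follows: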

import pycuda.driver as cuda
import threading

def callback():
    """Run CUDA/TensorRT work with a CUDA context made current in this thread."""
    # Required once `import pycuda.autoinit` is removed: the driver API must
    # be initialized before any other pycuda.driver call.
    cuda.init()
    device = cuda.Device(0)  # enter your GPU id here
    ctx = device.make_context()
    try:
        allocate_buffers()  # load CUDA buffers or run any other CUDA or TensorRT operations
    finally:
        # Very important: always pop the context, even if the work above
        # raised, so it does not stay pushed on this thread.
        ctx.pop()

if __name__ == "__main__":
    # Pass the function object itself: `target=callback()` would run the
    # callback immediately in the main thread and hand its return value
    # (None) to Thread as the target.
    worker_thread = threading.Thread(target=callback)
    worker_thread.start()
    worker_thread.join()

Note: do not forget to remove `import pycuda.autoinit` from both modules. Once it is removed, `cuda.init()` must be called before any other `pycuda.driver` function.

This is also discussed in a question here

