I am writing an RDMA (InfiniBand) kernel module.
So far I have successfully created the protection domain and the completion queues for the send and receive queues.
But whenever I try to create the queue pair by calling ib_create_qp, the call fails. The code I wrote is shown below:
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"
/* Not referenced anywhere else in the visible part of this file. */
struct workqueue_struct *myClient_workqueue;
/* Not referenced anywhere else in the visible part of this file. */
struct ib_sa_client myClient_sa_client;
/*
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/
/* Protection domain; assigned from ib_alloc_pd() in myClient_add_one(). */
struct ib_pd *mypd;
/* Receive completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myrcvcq;
/* Send completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myClientsendcq;
/* Queue pair; assigned from ib_create_qp() in myClient_add_one(). */
struct ib_qp *myClientqp;
/*
 * Completion handler passed to ib_create_cq() for the receive CQ.
 * It only logs a fixed message; the CQ itself is not touched.
 *
 * NOTE(review): the IB core's completion handler type usually takes
 * (struct ib_cq *, void *cq_context) - confirm this one-argument
 * signature against the kernel headers being built against.
 */
void myClient_ib_recvcompletion(struct ib_cq *cq)
{
	printk("A user-specified callback that is invoked "
	       "when a completion event occurs on the CQ.\n");
}
/*
 * Completion handler passed to ib_create_cq() for the send CQ.
 * It only logs a fixed message; the CQ itself is not touched.
 *
 * NOTE(review): the IB core's completion handler type usually takes
 * (struct ib_cq *, void *cq_context) - confirm this one-argument
 * signature against the kernel headers being built against.
 */
void myClient_ib_sendcompletion(struct ib_cq *cq)
{
	printk("A user-specified callback that is invoked "
	       "when a completion event occurs on the CQ.\n");
}
/*
 * Placeholder handler for asynchronous QP events: logs one line and
 * ignores both arguments.
 *
 * NOTE(review): nothing in the visible source references this function;
 * myClient_add_one() installs myClient_qp_event_handler instead.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent,
				void *anyPointer)
{
	printk(KERN_INFO "Dummy affiliated asynchronous event occured "
			 "function called \n");
}
static void myClient_add_one(struct ib_device *device)
{
union ib_gid tmp_gid;
int ret;
int hcaport = 1;
int result = -ENOMEM;
u16 port1Pkey;
struct ib_port_attr attr;
ret = ib_query_port(device,hcaport,&attr);
printk("ib query port result %d \n", ret);
// Creating the Protection Domain for RDMA
mypd = ib_alloc_pd(device);
if(IS_ERR(mypd)){
printk(KERN_INFO "Failed to allocate PD\n");
return;
}
else{
printk(KERN_INFO "1Successfully allocated the PD\n");
pdset = true;
}
// Creating the receive completion queue for RDMA
myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
if(IS_ERR(myrcvcq)){
pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
}
// Creating the send completion queue for RDMA
myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
if(IS_ERR(myClientsendcq)){
pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
}
// Creating the queue pair
// Creating the queue pair
struct ib_qp_init_attr init_qpattr;
memset(&init_qpattr,0,sizeof(init_qpattr));
init_qpattr.event_handler = myClient_qp_event_handler;
init_qpattr.cap.max_send_wr = 2;
init_qpattr.cap.max_recv_wr = 2;
init_qpattr.cap.max_recv_sge = 1;
init_qpattr.cap.max_send_sge = 1;
init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_qpattr.qp_type = IB_QPT_UD;
init_qpattr.send_cq = myClientsendcq;
init_qpattr.recv_cq = myrcvcq;
myClientqp = ib_create_qp(mypd,&init_qpattr);
if(IS_ERR(myClientqp)){
pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk(KERN_INFO "1The queue pair is successfully created \n");
qpcreated = true;
}
}
/*
 * myClient_remove_one - "remove" callback installed in my_client.
 * @device: IB device handed to the callback (unused).
 *
 * Currently a no-op.
 *
 * NOTE(review): nothing created by myClient_add_one() is released here.
 */
static void myClient_remove_one(struct ib_device *device)
{
	return;
}
/* Client descriptor that myRDMAclient_init() passes to ib_register_client(). */
static struct ib_client my_client = {
	.name   = "myRDMAclient",
	.add    = myClient_add_one,
	.remove = myClient_remove_one,
};
/*
 * myRDMAclient_init - module entry point.
 *
 * Registers my_client through ib_register_client().
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * ib_register_client(), which is also logged.
 */
static int __init myRDMAclient_init(void)
{
	int ret;

	ret = ib_register_client(&my_client);
	if (ret) {
		/* Report the failure instead of returning it silently. */
		printk(KERN_ERR "Failed to register IB client: %d\n", ret);
		return ret;
	}

	/* Routine success message: informational level, not an alert. */
	printk(KERN_INFO "Successfully registered myRDMAclient module\n");
	return 0;
}
module_init(myRDMAclient_init);
Here all the calls work except ib_create_qp(mypd,&init_qpattr);
which fails to create the queue pair.
Update: I now register the memory before creating the queue pair, but ib_create_qp still fails with an invalid argument error (error code -22):
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"
/* Not referenced anywhere else in the visible part of this file. */
struct workqueue_struct *myClient_workqueue;
/* Not referenced anywhere else in the visible part of this file. */
struct ib_sa_client myClient_sa_client;
/*
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/
/* Protection domain; assigned from ib_alloc_pd() in myClient_add_one(). */
struct ib_pd *mypd;
/* Receive completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myrcvcq;
/* Send completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myClientsendcq;
/* Queue pair; assigned from ib_create_qp() in myClient_add_one(). */
struct ib_qp *myClientqp;
/* DMA memory region; assigned from ib_get_dma_mr() in myClient_add_one(). */
struct ib_mr *mymr;
/*
 * Completion handler passed to ib_create_cq() for the receive CQ.
 * It only logs a fixed message; the CQ itself is not touched.
 *
 * NOTE(review): the IB core's completion handler type usually takes
 * (struct ib_cq *, void *cq_context) - confirm this one-argument
 * signature against the kernel headers being built against.
 */
void myClient_ib_recvcompletion(struct ib_cq *cq)
{
	printk("A user-specified callback that is invoked "
	       "when a completion event occurs on the CQ.\n");
}
/*
 * Completion handler passed to ib_create_cq() for the send CQ.
 * It only logs a fixed message; the CQ itself is not touched.
 *
 * NOTE(review): the IB core's completion handler type usually takes
 * (struct ib_cq *, void *cq_context) - confirm this one-argument
 * signature against the kernel headers being built against.
 */
void myClient_ib_sendcompletion(struct ib_cq *cq)
{
	printk("A user-specified callback that is invoked "
	       "when a completion event occurs on the CQ.\n");
}
/*
 * Placeholder handler for asynchronous QP events: logs one line and
 * ignores both arguments.
 *
 * NOTE(review): nothing in the visible source references this function;
 * myClient_add_one() installs myClient_qp_event_handler instead.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent,
				void *anyPointer)
{
	printk(KERN_INFO "Dummy affiliated asynchronous event occured "
			 "function called \n");
}
static void myClient_add_one(struct ib_device *device)
{
union ib_gid tmp_gid;
int ret;
int hcaport = 1;
int result = -ENOMEM;
u16 port1Pkey;
struct ib_port_attr attr;
ret = ib_query_port(device,hcaport,&attr);
printk("ib query port result %d \n", ret);
// Creating the Protection Domain for RDMA
mypd = ib_alloc_pd(device);
if(IS_ERR(mypd)){
printk(KERN_INFO "Failed to allocate PD\n");
return;
}
else{
printk(KERN_INFO "1Successfully allocated the PD\n");
pdset = true;
}
// Registering Memory
mymr = ib_get_dma_mr(mypd,IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
if(IS_ERR(mymr)){
printk("failed to register memory :( %d \n",PTR_ERR(mymr));
}else{
printk(KERN_INFO "Successfully registered memory region :) \n");
}
// End Registering Memory
// Creating the receive completion queue for RDMA
myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
if(IS_ERR(myrcvcq)){
pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
}
// Creating the send completion queue for RDMA
myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
if(IS_ERR(myClientsendcq)){
pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
}
// Creating the queue pair
// Creating the queue pair
struct ib_qp_init_attr init_qpattr;
memset(&init_qpattr,0,sizeof(init_qpattr));
init_qpattr.event_handler = myClient_qp_event_handler;
init_qpattr.cap.max_send_wr = 2;
init_qpattr.cap.max_recv_wr = 2;
init_qpattr.cap.max_recv_sge = 1;
init_qpattr.cap.max_send_sge = 1;
init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_qpattr.qp_type = IB_QPT_UD;
init_qpattr.send_cq = myClientsendcq;
init_qpattr.recv_cq = myrcvcq;
myClientqp = ib_create_qp(mypd,&init_qpattr);
if(IS_ERR(myClientqp)){
pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk(KERN_INFO "1The queue pair is successfully created \n");
qpcreated = true;
}
}
/*
 * myClient_remove_one - "remove" callback installed in my_client.
 * @device: IB device handed to the callback (unused).
 *
 * Currently a no-op.
 *
 * NOTE(review): nothing created by myClient_add_one() is released here.
 */
static void myClient_remove_one(struct ib_device *device)
{
	return;
}
/* Client descriptor that myRDMAclient_init() passes to ib_register_client(). */
static struct ib_client my_client = {
	.name   = "myRDMAclient",
	.add    = myClient_add_one,
	.remove = myClient_remove_one,
};
/*
 * myRDMAclient_init - module entry point.
 *
 * Registers my_client through ib_register_client().
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * ib_register_client(), which is also logged.
 */
static int __init myRDMAclient_init(void)
{
	int ret;

	ret = ib_register_client(&my_client);
	if (ret) {
		/* Report the failure instead of returning it silently. */
		printk(KERN_ERR "Failed to register IB client: %d\n", ret);
		return ret;
	}

	/* Routine success message: informational level, not an alert. */
	printk(KERN_INFO "Successfully registered myRDMAclient module\n");
	return 0;
}
module_init(myRDMAclient_init);
UPDATE:
Based on the discussion in the comments below, I'm guessing you installed Mellanox OFED drivers on top of your current distribution. Looking at the 3.1-1.0.3 source of the Mellanox OFED kernel drivers, I see that they changed the layout of struct ib_qp_init_attr
by adding some fields. I'm pretty sure that your problem is that you're building your module against the original SLE 3.0.76-0.11 kernel headers, so the init_qpattr
structure you're passing to the create QP function does not have the values you set up in the right places.
I don't know how you've installed the new out-of-tree drivers, so I can't tell you exactly how to build your module properly, but you could try adding something like
init_qpattr.qpg_type = 0;
to where you set up the struct. (I know you memset
the whole thing to zero already, but this will make sure that the headers you're building against have the new qpg_type
member for the structure. I think that's a new field added by OFED that isn't in your original kernel headers, so if your module compiles, then you're building against the right headers.)
OLD ANSWER:
So I suspect that you are running into a bug in the mlx4 driver related to creating such a small QP (max_send_wr == max_recv_wr == 2
and max_send_sge == max_recv_sge == 1
). I managed to find the source for the 3.0.76-0.11 kernel you're using, and I don't see any obvious bug, unfortunately.
Some things you could try to help debug this:
- Add the parameter debug_level=1 to the mlx4_core module when loading it. Update your question with all the output from driver initialization (a bunch of lines about "Max CQEs:" etc.). There is a fair amount of logic in the mlx4 driver that depends on the parameters returned by firmware during initialization, and this output would let us see what those are.
- Try increasing max_send_sge and max_recv_sge to 2, and increase max_send_wr and max_recv_wr to, say, 32 or 128. (Try increasing those individually or in combination.)
- Work out where in the driver QP creation is failing: is it failing in set_rq_size(), getting to set_kernel_sq_size(), or failing somewhere else?

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With