Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Cannot create queue pair with ib_create_qp

I am writing an RDMA (InfiniBand) kernel module.

So far I have successfully created the protection domain and the completion queues for the send and receive queues.

But whenever I try to create the queue pair by invoking ib_create_qp, the call fails. The code I wrote is shown below:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"


/* Declared here but not referenced anywhere else in the code shown. */
struct workqueue_struct *myClient_workqueue;
/* Declared here but not referenced anywhere else in the code shown. */
struct ib_sa_client myClient_sa_client;
/*
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/

/*
 * Module-wide verbs objects, all assigned in myClient_add_one().
 * There is only one set, so every call to myClient_add_one() overwrites them.
 */
struct ib_pd *mypd;             /* result of ib_alloc_pd() */
struct ib_cq *myrcvcq;          /* receive completion queue from ib_create_cq() */
struct ib_cq *myClientsendcq;   /* send completion queue from ib_create_cq() */
struct ib_qp *myClientqp;       /* result of ib_create_qp() */

void myClient_ib_recvcompletion(struct ib_cq *cq)
{
    printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}


void myClient_ib_sendcompletion(struct ib_cq *cq)
{
        printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Placeholder handler for asynchronous QP events (per its name and log text).
 * Both arguments are ignored; it only emits one informational log line.
 *
 * NOTE(review): myClient_add_one() installs a handler named
 * myClient_qp_event_handler rather than this function - confirm which
 * one is intended.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
    pr_info("Dummy affiliated asynchronous event occured function called \n");
}


static void myClient_add_one(struct ib_device *device)
{
    union ib_gid tmp_gid;
    int ret;
    int hcaport = 1;
    int result = -ENOMEM;
    u16 port1Pkey;
    struct ib_port_attr attr;

        ret = ib_query_port(device,hcaport,&attr);
        printk("ib query port result %d  \n", ret);

//  Creating the Protection Domain for RDMA
    mypd = ib_alloc_pd(device);

    if(IS_ERR(mypd)){
        printk(KERN_INFO "Failed to allocate PD\n");
        return;
    }
    else{
        printk(KERN_INFO "1Successfully allocated the PD\n");
        pdset = true;
    }

//  Creating the receive completion queue for RDMA
    myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
        if(IS_ERR(myrcvcq)){
                pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
    else{
        printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
    }

//  Creating the send completion queue for RDMA
    myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
        if(IS_ERR(myClientsendcq)){
                pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
        }

//  Creating the queue pair
//      Creating the queue pair

        struct ib_qp_init_attr init_qpattr;

        memset(&init_qpattr,0,sizeof(init_qpattr));
        init_qpattr.event_handler = myClient_qp_event_handler;
        init_qpattr.cap.max_send_wr = 2;
        init_qpattr.cap.max_recv_wr = 2;
        init_qpattr.cap.max_recv_sge = 1;
        init_qpattr.cap.max_send_sge = 1;
        init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
        init_qpattr.qp_type = IB_QPT_UD;
        init_qpattr.send_cq = myClientsendcq;
        init_qpattr.recv_cq = myrcvcq;

        myClientqp = ib_create_qp(mypd,&init_qpattr);

        if(IS_ERR(myClientqp)){
                pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk(KERN_INFO "1The queue pair is successfully created \n");
                qpcreated = true;
        }



}
static void myClient_remove_one(struct ib_device *device)
{
}

/*
 * Client descriptor handed to ib_register_client() in myRDMAclient_init().
 * It names the client and points at the add/remove callbacks defined above.
 */
static struct ib_client my_client = {
        .name   = "myRDMAclient",
        .add    = myClient_add_one,
        .remove = myClient_remove_one
};


/*
 * Module entry point: registers my_client with ib_register_client().
 *
 * Returns 0 on success, or the non-zero value returned by
 * ib_register_client() on failure (which is also logged).
 */
static int __init myRDMAclient_init(void)
{
    int ret;

    ret = ib_register_client(&my_client);
    if (ret) {
        /* Report the failure instead of returning silently. */
        printk(KERN_ERR "Failed to register IB client: %d\n", ret);
        return ret;
    }

    /* Routine success message: informational level, no stray level text. */
    printk(KERN_INFO "Successfully registered myRDMAclient module \n");
    return 0;
}


module_init(myRDMAclient_init);

Here all the calls work except ib_create_qp(mypd, &init_qpattr), which fails to create the queue pair.

Update: I now register the memory before creating the queue pair, but ib_create_qp still fails with an invalid-argument error (error code -22).

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"


/* Declared here but not referenced anywhere else in the code shown. */
struct workqueue_struct *myClient_workqueue;
/* Declared here but not referenced anywhere else in the code shown. */
struct ib_sa_client myClient_sa_client;
/*
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/

/*
 * Module-wide verbs objects, all assigned in myClient_add_one().
 * There is only one set, so every call to myClient_add_one() overwrites them.
 */
struct ib_pd *mypd;             /* result of ib_alloc_pd() */
struct ib_cq *myrcvcq;          /* receive completion queue from ib_create_cq() */
struct ib_cq *myClientsendcq;   /* send completion queue from ib_create_cq() */
struct ib_qp *myClientqp;       /* result of ib_create_qp() */
struct ib_mr *mymr;             /* result of ib_get_dma_mr() */

void myClient_ib_recvcompletion(struct ib_cq *cq)
{
    printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}


void myClient_ib_sendcompletion(struct ib_cq *cq)
{
        printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Placeholder handler for asynchronous QP events (per its name and log text).
 * Both arguments are ignored; it only emits one informational log line.
 *
 * NOTE(review): myClient_add_one() installs a handler named
 * myClient_qp_event_handler rather than this function - confirm which
 * one is intended.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
    pr_info("Dummy affiliated asynchronous event occured function called \n");
}


static void myClient_add_one(struct ib_device *device)
{
    union ib_gid tmp_gid;
    int ret;
    int hcaport = 1;
    int result = -ENOMEM;
    u16 port1Pkey;
    struct ib_port_attr attr;

        ret = ib_query_port(device,hcaport,&attr);
        printk("ib query port result %d  \n", ret);

//  Creating the Protection Domain for RDMA
    mypd = ib_alloc_pd(device);

    if(IS_ERR(mypd)){
        printk(KERN_INFO "Failed to allocate PD\n");
        return;
    }
    else{
        printk(KERN_INFO "1Successfully allocated the PD\n");
        pdset = true;
    }
// Registering Memory
    mymr = ib_get_dma_mr(mypd,IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
    if(IS_ERR(mymr)){
            printk("failed to register memory :( %d \n",PTR_ERR(mymr));
    }else{
            printk(KERN_INFO "Successfully registered memory region :) \n");
    }
// End Registering Memory
//  Creating the receive completion queue for RDMA
    myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
        if(IS_ERR(myrcvcq)){
                pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
    else{
        printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
    }

//  Creating the send completion queue for RDMA
    myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
        if(IS_ERR(myClientsendcq)){
                pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
        }

//  Creating the queue pair
//      Creating the queue pair

        struct ib_qp_init_attr init_qpattr;

        memset(&init_qpattr,0,sizeof(init_qpattr));
        init_qpattr.event_handler = myClient_qp_event_handler;
        init_qpattr.cap.max_send_wr = 2;
        init_qpattr.cap.max_recv_wr = 2;
        init_qpattr.cap.max_recv_sge = 1;
        init_qpattr.cap.max_send_sge = 1;
        init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
        init_qpattr.qp_type = IB_QPT_UD;
        init_qpattr.send_cq = myClientsendcq;
        init_qpattr.recv_cq = myrcvcq;

        myClientqp = ib_create_qp(mypd,&init_qpattr);

        if(IS_ERR(myClientqp)){
                pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk(KERN_INFO "1The queue pair is successfully created \n");
                qpcreated = true;
        }



}
static void myClient_remove_one(struct ib_device *device)
{
}

/*
 * Client descriptor handed to ib_register_client() in myRDMAclient_init().
 * It names the client and points at the add/remove callbacks defined above.
 */
static struct ib_client my_client = {
        .name   = "myRDMAclient",
        .add    = myClient_add_one,
        .remove = myClient_remove_one
};


/*
 * Module entry point: registers my_client with ib_register_client().
 *
 * Returns 0 on success, or the non-zero value returned by
 * ib_register_client() on failure (which is also logged).
 */
static int __init myRDMAclient_init(void)
{
    int ret;

    ret = ib_register_client(&my_client);
    if (ret) {
        /* Report the failure instead of returning silently. */
        printk(KERN_ERR "Failed to register IB client: %d\n", ret);
        return ret;
    }

    /* Routine success message: informational level, no stray level text. */
    printk(KERN_INFO "Successfully registered myRDMAclient module \n");
    return 0;
}


module_init(myRDMAclient_init);
like image 338
user3243499 Avatar asked Jan 14 '16 11:01

user3243499


1 Answers

UPDATE:

Based on the discussion in the comments below, I'm guessing you installed Mellanox OFED drivers on top of your current distribution. Looking at the 3.1-1.0.3 source of Mellanox OFED kernel drivers, I see that they changed the layout of struct ib_qp_init_attr by adding some fields. I'm pretty sure that your problem is that you're building your module against the original SLE 3.0.76-0.11 kernel headers, so the init_qpattr structure you're passing to the create QP function does not have the values you set up in the right places.

I don't know how you've installed the new out-of-tree drivers, so I can't tell you exactly how to build your module properly, but you could try adding something like

    init_qpattr.qpg_type = 0;

to where you set up the struct. (I know you memset the whole thing to zero already, but this will make sure that the headers you're building against have the new qpg_type member for the structure. I think that's a new field added by OFED that isn't in your original kernel headers, so if your module compiles, then you're building against the right headers)

OLD ANSWER:

So I suspect that you are running into a bug in the mlx4 driver related to creating such a small QP (max_send_wr == max_recv_wr == 2 and max_send_sge == max_recv_sge == 1). I managed to find the source for the 3.0.76-0.11 kernel you're using, and I don't see any obvious bug, unfortunately.

Some things you could try to help debug this

  1. Add a module parameter debug_level=1 to the mlx4_core module when loading it. Update your question with all the output from driver initialization (a bunch of lines about "Max CQEs:" etc.). There is a fair amount of logic in the mlx4 driver that depends on the parameters returned by firmware during initialization, and this output would let us see what those are.
  2. For that matter, it's worth checking if your HCA firmware is up-to-date — you may get better results with newer firmware (although the driver should work anyway, you might be hitting a bug in untested driver code because of a missing firmware feature that triggers a different code path).
  3. Try updating your code to increase those parameters. You could try increasing max_send_sge and max_recv_sge to 2 and increasing max_send_wr and max_recv_wr to, say, 32 or 128. (Try increasing those individually or in combination.)
  4. If you know how to enable the function tracer (This LWN article is helpful; I'm assuming the old SLES kernel has all the required features), then enabling tracing for the mlx4_ib and mlx4_core modules and then loading your module would be great. If you update your question with the trace, then we can look at where the create QP operation is failing — for example, is it failing in set_rq_size(), getting to set_kernel_sq_size() or failing somewhere else?
like image 177
Roland Avatar answered Oct 06 '22 00:10

Roland