Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to best write out a std::vector < std::string > container to a HDF5 dataset?

Tags:

c++

hdf5

stl

Given a vector of strings, what is the best way to write them out to a HDF5 dataset? At the moment I'm doing something like the following:

  // Size, in bytes, of the fixed-width character buffer held by each record.
  const unsigned int MaxStrLength = 512u;

  // A single fixed-width record: one character array of MaxStrLength bytes.
  struct TempContainer
  {
    char string[MaxStrLength];
  };

  // Writes every string in `v` to a one-dimensional dataset named "files"
  // created under `group`.
  //
  // Each element is stored as a compound record with a single fixed-width
  // member "string" of MaxStrLength bytes.  Strings longer than
  // MaxStrLength - 1 characters are truncated so that every stored record
  // ends with a NUL terminator.
  //
  // group - identifier of the HDF5 location in which the dataset is created
  // v     - strings to write
  void writeVector (hid_t group, std::vector<std::string> const & v)
  {
    //
    // Firstly copy the contents of the vector into a temporary container
    std::vector<TempContainer> tc;
    tc.reserve (v.size ());
    for (std::vector<std::string>::const_iterator i = v.begin ()
                                              , end = v.end ()
      ; i != end
      ; ++i)
    {
      TempContainer t;
      // strncpy zero-pads short inputs but does NOT terminate an input that
      // fills the whole buffer, so copy one byte less and terminate by hand.
      strncpy (t.string, i->c_str (), MaxStrLength - 1);
      t.string[MaxStrLength - 1] = '\0';
      tc.push_back (t);
    }


    //
    // Write the temporary container to a dataset
    hsize_t     dims[] = { tc.size () } ;
    hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                               , dims
                               , NULL);

    // Fixed-width string type, MaxStrLength bytes per element.
    hid_t strtype = H5Tcopy (H5T_C_S1);
    H5Tset_size (strtype, MaxStrLength);

    // Compound type mirroring the in-memory layout of TempContainer.
    hid_t datatype = H5Tcreate (H5T_COMPOUND, sizeof (TempContainer));
    H5Tinsert (datatype
      , "string"
      , HOFFSET(TempContainer, string)
      , strtype);

    hid_t dataset = H5Dcreate1 (group
                          , "files"
                          , datatype
                          , dataspace
                          , H5P_DEFAULT);

    // Do not index an empty vector: tc[0] does not exist when v is empty.
    H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT
      , tc.empty () ? NULL : &tc[0] );

    H5Dclose (dataset);
    H5Sclose (dataspace);
    H5Tclose (strtype);
    H5Tclose (datatype);
  }

At a minimum, I would really like to change the above so that:

  1. It uses variable length strings
  2. I don't need to have a temporary container

I have no restrictions over how I store the data so for example, it doesn't have to be a COMPOUND datatype if there is a better way to do this.

EDIT: Just to narrow the problem down, I'm relatively familiar with playing with the data on the C++ side, it's the HDF5 side where I need most of the help.

Thanks for your help.

like image 734
Richard Corden Avatar asked Feb 24 '09 10:02

Richard Corden


4 Answers

[Many thanks to dirkgently for his help in answering this.]

To write a variable length string in HDF5 use the following:

// Build a variable-length string type, starting from a copy of H5T_C_S1
hid_t datatype = H5Tcopy (H5T_C_S1);
H5Tset_size (datatype, H5T_VARIABLE);

//
// H5Dwrite is handed the address of the char pointer itself (&text),
// not the character data that the pointer refers to.
const char * text = v.c_str ();
H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &text);

One solution for writing a container is to write each element individually. This can be achieved using hyperslabs.

For example:

class WriteString
{
public:
  WriteString (hid_t dataset, hid_t datatype
      , hid_t dataspace, hid_t memspace)
    : m_dataset (dataset), m_datatype (datatype)
    , m_dataspace (dataspace), m_memspace (memspace)
    , m_pos () {}

private:
  hid_t m_dataset;
  hid_t m_datatype;
  hid_t m_dataspace;
  hid_t m_memspace;
  int m_pos;

//...

public:
  void operator ()(std::vector<std::string>::value_type const & v)
  {
    // Select the file position, 1 record at position 'pos'
    hsize_t count[] = { 1 } ;
    hsize_t offset[] = { m_pos++ } ;
    H5Sselect_hyperslab( m_dataspace
      , H5S_SELECT_SET
      , offset
      , NULL
      , count
      , NULL );

    const char * s = v.c_str ();
    H5Dwrite (m_dataset
      , m_datatype
      , m_memspace
      , m_dataspace
      , H5P_DEFAULT
      , &s );
    }    
};

// ...

// Writes every string in `v` to a new one-dimensional dataset named "files"
// under `group`, stored as variable-length strings.  The strings are written
// one element at a time through the WriteString function object.
//
// group - identifier of the HDF5 location in which the dataset is created
// v     - strings to write
void writeVector (hid_t group, std::vector<std::string> const & v)
{
  // File dataspace: one element per string in v.
  hsize_t     dims[] = { v.size ()  } ;
  hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                    , dims, NULL);

  // Memory dataspace: a single element, reused for every write.
  dims[0] = 1;
  hid_t memspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                    , dims, NULL);

  hid_t datatype = H5Tcopy (H5T_C_S1);
  H5Tset_size (datatype, H5T_VARIABLE);

  hid_t dataset = H5Dcreate1 (group, "files", datatype
                             , dataspace, H5P_DEFAULT);

  // 
  // Select the "memory" to be written out - just 1 record.
  hsize_t offset[] = { 0 } ;
  hsize_t count[] = { 1 } ;
  H5Sselect_hyperslab( memspace, H5S_SELECT_SET, offset
                     , NULL, count, NULL );

  // One H5Dwrite per string; the functor advances the file offset itself.
  std::for_each (v.begin ()
      , v.end ()
      , WriteString (dataset, datatype, dataspace, memspace));

  H5Dclose (dataset);
  H5Sclose (dataspace);
  H5Sclose (memspace);
  H5Tclose (datatype);
}
like image 77
Richard Corden Avatar answered Oct 17 '22 00:10

Richard Corden


Here is some working code for writing a vector of variable length strings using the HDF5 c++ API.

I incorporate some of the suggestions in the other posts:

  1. use H5T_C_S1 and H5T_VARIABLE
  2. use string::c_str() to obtain pointers to the strings
  3. place the pointers into a vector of char* and pass to the HDF5 API

It is not necessary to create expensive copies of the string (e.g. with strdup()). c_str() returns a pointer to the null terminated data of the underlying string. This is precisely what the function is intended for. Of course, strings with embedded nulls will not work with this...

std::vector is guaranteed to have contiguous underlying storage, so using vector and vector::data() is the same as using raw arrays but is of course much neater and safer than the clunky, old-fashioned c way of doing things.

#include "H5Cpp.h"
// Writes `strings` to a new one-dimensional dataset of variable-length
// strings named `data_set_name` in `file`.
//
// Throws std::runtime_error, carrying the failing HDF5 function name and its
// detail message, when an H5::Exception is raised inside the try block.
void write_hdf5(H5::H5File file, const std::string& data_set_name,
                const std::vector<std::string>& strings )
{
    H5::Exception::dontPrint();

    try
    {
        // HDF5 only understands vector of char* :-(
        // The pointers are borrowed from `strings`; no characters are copied.
        std::vector<const char*> arr_c_str;
        arr_c_str.reserve(strings.size());
        for (const std::string& s : strings)
            arr_c_str.push_back(s.c_str());

        //
        //  one dimension
        // 
        hsize_t     str_dimsf[1] {arr_c_str.size()};
        H5::DataSpace   dataspace(1, str_dimsf);

        // Variable length string
        H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE); 
        H5::DataSet str_dataset = file.createDataSet(data_set_name, datatype, dataspace);

        str_dataset.write(arr_c_str.data(), datatype);
    }
    catch (H5::Exception& err)
    {
        // Re-throw as a standard exception so callers need not know H5 types.
        throw std::runtime_error(std::string("HDF5 Error in " ) 
                                    + err.getFuncName()
                                    + ": "
                                    + err.getDetailMsg());
    }
}
like image 41
Leo Goodstadt Avatar answered Oct 17 '22 00:10

Leo Goodstadt


If you are looking for cleaner code, I suggest you create a functor that takes a string and saves it to the HDF5 container (in a desired mode). Richard, I used the wrong algorithm, please re-check!

// Sketch of a functor that saves one string per call to an HDF5 container.
// `_dataset(...)` is a placeholder for the real member and its setup.
struct hdf5 : public std::unary_function<std::string, void> {
    hdf5() : _dataset(...) {} // initialize the HDF5 db
    ~hdf5() { /* close the HDF5 db */ }
    void operator()(const std::string& s) {
            // append 
            // use s.c_str() ?
    }
};

// Usage: apply the functor to every string in the vector.
std::for_each(v.begin(), v.end(), hdf5());

Does that help get started?

like image 1
dirkgently Avatar answered Oct 17 '22 00:10

dirkgently


I had a similar issue, with the caveat that I wanted a vector of strings stored as an attribute. The tricky thing with attributes is that we can't use fancy dataspace features like hyperslabs (at least with the C++ API).

But in either case, it may be useful to enter a vector of strings into a single entry in a dataset (if, for example, you always expect to read them together). In this case all the magic comes with the type, not with the dataspace itself.

There are basically 4 steps:

  1. Make a vector<const char*> which points to the strings.
  2. Create a hvl_t structure that points to the vector and contains its length.
  3. Create the datatype. This is a H5::VarLenType wrapping a (variable length) H5::StrType.
  4. Write the hvl_t type to a dataset.

The really nice part of this method is that you're stuffing the whole entry into what HDF5 considers a scalar value. This means that making it an attribute (rather than a dataset) is trivial.

Whether you choose this solution or the one with each string in its own dataset entry is probably also a matter of the desired performance: if you're looking for random access to specific strings, it's probably better to write the strings out in a dataset so they can be indexed. If you're always going to read them all out together this solution may work just as well.

Here's a short example of how to do this, using the C++ API and a simple scalar dataset:

#include <vector>
#include <string>
#include "H5Cpp.h"

int main(int argc, char* argv[]) {
  // Part 0: make up some data
  std::vector<std::string> strings;
  for (int iii = 0; iii < 10; iii++) {
    strings.push_back("this is " + std::to_string(iii));
  }

  // Part 1: grab pointers to the chars
  std::vector<const char*> chars;
  for (const auto& str: strings) {
    chars.push_back(str.data());
  }

  // Part 2: create the variable length type
  hvl_t hdf_buffer;
  hdf_buffer.p = chars.data();
  hdf_buffer.len = chars.size();

  // Part 3: create the type
  auto s_type = H5::StrType(H5::PredType::C_S1, H5T_VARIABLE);
  s_type.setCset(H5T_CSET_UTF8); // just for fun, you don't need this
  auto svec_type = H5::VarLenType(&s_type);

  // Part 4: write the output to a scalar dataset
  H5::H5File out_file("vtest.h5", H5F_ACC_EXCL);
  H5::DataSet dataset(
    out_file.createDataSet("the_ds", svec_type, H5S_SCALAR));
  dataset.write(&hdf_buffer, svec_type);

  return 0;
}
like image 1
Shep Avatar answered Oct 16 '22 22:10

Shep