I'm looking for a way to unpack binary data. The data is described by a whole tree of structs (up to four layers deep, total struct size is almost 64k) in a C header file.
For this question: Unpacking nested C structs in Python the only answer was to "flatten" the whole data description into one string, but I'm reluctant to do this in my case because it would be unmaintainable.
What I'd love to have is something that reads the header file and then magically creates a function that takes the binary data and returns a (properly nested) dictionary with all the data.
Is there any Python module that can accomplish this, or some parts of it?
Or are there other ways to deal with such data in Python?
I do not know how to reliably do the hard part, parsing a C header file, because its declarations may depend on other include files. For example, you could use a struct stat
that depends on sys/stat.h
and possibly on other standard include files. But I'll come back to that point later ...
So I propose that you do this part by hand, and use a hierarchical description that allows the hierarchical structures to be rebuilt from the flat list of fields returned by the struct module.
Say we have a struct described in C language by :
struct S4 {
char c;
char d;
};
struct my_struct {
int a[4];
struct {
int b;
struct {
struct S4 z[2];
int e;
} y;
} x;
int f;
struct {
int g;
struct {
struct S4 v;
int j;
} u;
} t;
};
I first write by hand my initial structure description :
'''\
S4: { c: b, d: b, c: 2x}
{ a: 4h, x: { b: h, y: { z: 2 S4, e: h}}, f: h, t: { g: h, u: { v: S4, j: h}}}\
'''
that gives the names of fields and their type (in struct
module format)
Then a parser (using PLY) parses it and returns an object that can transform a binary struct into a namedtuple recursively containing the whole struct with the correct names, all inner structs also being namedtuples (the struct
format string comes as a by-product)
- all that correctly manages explicit padding
- it is possible to use named structures (S4
in the example), they have to be declared before use
- arrays (of simple elements or named structs) are implemented by adding a number before the type ( a : 4 h
<=> short a[4];
or x : 2 S4
<=> struct S4 x[2];
in the example)
But it is not a single file light solution, it would need a good deal of comments and docs. An alternative could be to directly use pycparser
and original C header files, but it will be even more complex and I do not like very much the idea of importing standard C library headers. Cannot explain why, but I feel it less robust.
Here is an example of use :
>>> s = '''\
S4: { c: b, d: b, c: 2x}
{ a: 4h, x: { b: h, y: { z: 2 S4, e: h}}, f: h, t: { g: h, u: { v: S4, j: h}}}\
'''
>>> ns = Named_struct(s)
>>> import struct
>>> b = struct.pack(ns.fmt, * range(15))
>>> b
b'\x00\x00\x01\x00\x02\x00\x03\x00\x04\x00\x05\x06\x00\x00\x07\x08\x00\x00\t\x00\n\x00\x0b\x00\x0c\r\x00\x00\x0e\x00'
>>> t = ns.unpack(b)
>>> t
_X_(a=[0, 1, 2, 3], x=x(b=4, y=y(z=[x(c=5, d=6), x(c=7, d=8)], e=9)), f=10, t=t(g=11, u=u(v=v(c=12, d=13), j=14)))
>>> t.x.y.z[1]
x(c=7, d=8)
>>> t.x.y.z[1].d
8
If you are interested, I could put it on github or ...
Let's come back to the hard part. I did some research, but for now I can only give you some possible directions, and a lot of work will remain. You could try to use pycparser, but it only knows about C (not C++) and its documentation is not that extensive. Or, if you need C++, you could try the Clang Python bindings, as proposed on the web site of Eli Bendersky (the author of pycparser) in "Parsing C++ in Python with Clang", but there the documentation is almost non-existent.
Now for the code. I still consider it a work in progress because of the lack of comments and formalized tests, but it does work.
h_lexer.py
import ply.lex as lex
# Extra lexer state: t_COLON switches to the exclusive 'value' state, and the
# TYPE, PAD, NAME and BEG rules switch back to INITIAL.
states = ( ('value', 'exclusive'),)
# Token names produced by this lexer; h_yacc imports this tuple.
tokens = ( 'BEG', 'NAME', 'COLON', 'END', 'COMMA', 'TYPE', 'PAD', 'NUMBER')
# Value of t.lexpos recorded by t_newline at the last newline it matched;
# getpos() subtracts it from lexer.lexpos to derive a column.
linepos = 0
# Field-name / value separator.
# The string literal that opens the body is the token regex PLY reads from the
# function docstring; it must stay the first statement.
def t_COLON(t):
    r':'
    # What follows a colon is a value: enter the exclusive 'value' state so
    # that the TYPE, PAD and NUMBER rules become active.
    t.lexer.begin('value')
    return t
# Opening brace of a struct body, matched in every lexer state (ANY prefix).
# The leading string literal is the token regex read by PLY.
def t_ANY_BEG(t):
    r'{'
    # Inside the braces the next word is a field name, so go back to INITIAL.
    t.lexer.begin('INITIAL')
    return t
# Scalar type of a field, only matched in the 'value' state: either one of the
# single format characters listed in the regex, or an optional decimal count
# followed by 's' or 'P'.
# The leading string literal is the token regex read by PLY.
def t_value_TYPE(t):
    r'[cbB?hHiIlLqQnNfdp]|\d*[sP]'
    # The value is complete: return to INITIAL for the following ',' or '}'.
    t.lexer.begin('INITIAL')
    return t
# Padding marker 'x', only matched in the 'value' state.
# The leading string literal is the token regex read by PLY.
def t_value_PAD(t):
    r'x'
    # The value is complete: return to INITIAL for the following ',' or '}'.
    t.lexer.begin('INITIAL')
    return t
# Decimal repeat count placed before a value (e.g. the 4 in "4h"), only
# matched in the 'value' state.
# The leading string literal is the token regex read by PLY.
def t_value_NUMBER(t):
    r'\d+'
    # No state change: the token after the count is still part of the value.
    return t
# Identifier, matched in every lexer state (ANY prefix). The grammar uses NAME
# both for a field name and for a reference to a named struct.
# It is defined after the 'value' rules, and PLY tries function rules in
# definition order, so TYPE, PAD and NUMBER are tried first in that state.
# The leading string literal is the token regex read by PLY.
def t_ANY_NAME(t):
    r'\w+'
    t.lexer.begin('INITIAL')
    return t
# Separator between two fields of a struct body.
t_COMMA = ','
# Characters skipped between tokens in every state: space, tab, carriage return.
t_ANY_ignore= ' \t\r'
# Closing brace of a struct body.
t_END=r'}'
# Line bookkeeping for error messages: counts lines and remembers where the
# current line starts. Nothing is returned, so no token reaches the parser.
# NOTE(review): the rule name has no state prefix, so it presumably applies to
# the INITIAL state only; a newline met in the 'value' state would then be
# handled by t_ANY_error - confirm this is intended.
# The leading string literal is the token regex read by PLY.
def t_newline(t):
    r'\n'
    global linepos
    t.lexer.lineno += 1
    # Offset of the newline itself; getpos() measures columns from here.
    linepos = t.lexpos
# Error handling rule, active in every lexer state: report the offending
# character on stdout, then skip it so that lexing continues.
def t_ANY_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
def getpos(lexer):
    """Return the current lexer position as a ``(line, column)`` pair.

    ``line`` is ``lexer.lineno``. ``column`` is ``lexer.lexpos`` minus the
    offset stored in the module-level ``linepos`` (updated by ``t_newline``).
    Callers must unpack the result in that order: line first, column second.
    """
    # ``linepos`` is only read here, so no ``global`` declaration is needed.
    return lexer.lineno, lexer.lexpos - linepos
h_yacc.py :
import ply.yacc as yacc
from h_lexer import tokens
# Registry of named struct descriptions, keyed by name. p_named_struct fills it
# and p_val_struct reads it. It is module-level, so nothing in this module
# clears it between two parses.
named_structs = {}
# Start rule: optional named struct declarations followed by the main struct.
# Only the main struct description is returned; the declarations have already
# been recorded by p_named_struct.
# The leading string literal is the grammar production read by PLY.
def p_prog(p):
    'prog : named_strucs struc'
    p[0] = p[2]
# Empty list of named struct declarations; no value is produced.
# The leading string literal is the grammar production read by PLY.
def p_named_structs_empty(p):
    'named_strucs : '
# One more named struct declaration appended to the list; no value is
# produced because p_named_struct stores each declaration itself.
# The leading string literal is the grammar production read by PLY.
def p_named_structs(p):
    'named_strucs : named_strucs named_struc'
# Declaration "NAME : { ... }": record the struct description under its name
# so that later fields can refer to it.
# The leading string literal is the grammar production read by PLY.
def p_named_struct(p):
    'named_struc : NAME COLON struc'
    # The dict is mutated, not rebound, so no ``global`` declaration is needed.
    named_structs[p[1]] = p[3]
# Struct body "{ field, field, ... }": its value is the list of fields built
# by the 'affects' rules.
# The leading string literal is the grammar production read by PLY.
def p_struc(p):
    'struc : BEG affects END'
    p[0] = p[2]
# Scalar type: the value is the text of the TYPE token.
# The leading string literal is the grammar production read by PLY.
def p_typ(p):
    'typ : TYPE'
    p[0] = p[1]
# Single padding marker: the value is the text of the PAD token.
# The leading string literal is the grammar production read by PLY.
def p_pad(p):
    'pad : PAD'
    p[0] = p[1]
# Counted padding such as "2x": the value is the count text followed by the
# pad marker, concatenated into one string.
# The leading string literal is the grammar production read by PLY.
def p_pad_n(p):
    'pad : NUMBER PAD'
    p[0] = p[1] + p[2]
# Padding field "name : pad": the field name is discarded and replaced by
# None, which marks the entry as padding for Named_struct.
# The leading string literal is the grammar production read by PLY.
def p_affect_pad(p):
    'affect : NAME COLON pad'
    p[0] = (None, p[3])
# A value is either a scalar type or an inline struct body; it is passed
# through unchanged.
# The leading string literal is the grammar production read by PLY.
def p_val(p):
    '''val : typ
           | struc'''
    p[0] = p[1]
# Reference to a named struct: the value is the description recorded earlier
# under that name. test_named_struct raises SyntaxError for an unknown name.
# The leading string literal is the grammar production read by PLY.
def p_val_struct(p):
    'val : NAME'
    test_named_struct(p[1])
    p[0] = named_structs[p[1]]
# Value without a repeat count: passed through unchanged.
# The leading string literal is the grammar production read by PLY.
def p_nval_single(p):
    'nval : val'
    p[0] = p[1]
# Array "N val": the value is a tuple holding the element description N times
# (every slot refers to the same object). Named_struct tells arrays apart from
# structs by this tuple type.
# A count of 0 raises SyntaxError; a count of 1 is accepted with a warning
# printed on stdout.
# The leading string literal is the grammar production read by PLY.
def p_nval_multi(p):
    'nval : NUMBER val'
    n = int(p[1])
    if n == 0:
        raise SyntaxError
    if n == 1:
        print('WARN: array of one single element')
    p[0] = (p[2],) * n
# Regular field "name : value": the value is a (name, description) pair.
# The leading string literal is the grammar production read by PLY.
def p_affect(p):
    'affect : NAME COLON nval'
    p[0] = (p[1], p[3])
# First field of a struct body: start a one-element list of fields.
# The leading string literal is the grammar production read by PLY.
def p_affects_init(p):
    'affects : affect'
    p[0] = [p[1]]
# Further field of a struct body: the value is a new list made of the fields
# seen so far plus the new one (the previous list is left unmodified).
# The leading string literal is the grammar production read by PLY.
def p_affects_multi(p):
    'affects : affects COMMA affect'
    p[0] = p[1] + [p[3]]
# Parser error hook: print the offending token (whatever object PLY passes in)
# on stdout. Nothing is raised here; Named_struct detects the failure from the
# None result of the parse.
def p_error(p):
    print("Syntax error", p)
def test_named_struct(name):
    """Check that *name* refers to an already declared named struct.

    Raises:
        SyntaxError: if *name* is not a key of the module-level
            ``named_structs`` registry.
    """
    # ``named_structs`` is only read here, so no ``global`` is needed.
    if name not in named_structs:
        raise SyntaxError("unknown named struct %r" % (name,))
h_named_struct.py
import h_yacc
import h_lexer
import collections
import struct
class Named_struct(object):
    """Unpack a binary buffer into nested namedtuples.

    The layout is given as a textual description parsed by ``h_yacc``. After
    construction the instance exposes:

    * ``fmt``    - the flat ``struct`` format string (prefix + field codes),
    * ``struct`` - the matching precompiled ``struct.Struct``,
    * ``t``      - the ``(factory, template)`` pair used to rebuild nesting,
    * ``l``      - the raw parsed description,
    * ``name``   - the type name of the outermost namedtuple.
    """

    # The parser and the lexer are built once, when the class body runs, and
    # are shared by every instance.
    # NOTE(review): h_yacc.named_structs is module-level and is not cleared
    # here, so named structs declared by one description stay visible to the
    # descriptions parsed afterwards - confirm whether that is intended.
    yacc = h_yacc.yacc.yacc(module = h_yacc)
    lexer = h_lexer.lex.lex(module = h_lexer)

    def __init__(self, desc_str, name='_X_', struc_typ = '='):
        """Parse *desc_str* and prepare the format string and templates.

        Args:
            desc_str: description text (named struct declarations followed by
                the main struct).
            name: type name given to the outermost namedtuple.
            struc_typ: prefix put in front of the generated ``struct`` format
                string (the byte order / size / alignment character).

        Raises:
            Exception: if the description cannot be parsed; the message gives
                the approximate line and column.
        """
        cls = self.__class__
        # The lexer is shared between instances and its line bookkeeping is
        # never reset by a new parse; start from a clean state so that the
        # position in the error message refers to *this* description.
        cls.lexer.lineno = 1
        h_lexer.linepos = 0
        l = cls.yacc.parse(desc_str, cls.lexer)
        if l is None:
            # getpos() returns (line, column), in that order.
            line, col = h_lexer.getpos(cls.lexer)
            raise Exception(
                "Syntax error in description string near line %d col %d" %
                (line, col - 1))
        fmts = []
        self.t = self.__make_named(l, fmts, name)
        self.fmt = struc_typ + ''.join(fmts)
        self.struct = struct.Struct(self.fmt)
        self.name = name
        self.l = l

    def __make_named(self, l, fmts, name):
        """Build the namedtuple type and the template of one struct level.

        Args:
            l: list of ``(field_name, description)`` pairs; a ``None`` field
                name marks padding.
            fmts: list extended in place with the format codes, in field
                order.
            name: type name of the namedtuple created for this level.

        Returns:
            A ``(namedtuple_type, template)`` pair. The template is an
            instance of that type whose fields are ``None`` for a scalar, or a
            nested ``(factory, template)`` pair for an array or inner struct.
        """
        names = []
        values = []
        for elt in l:
            if elt[0] is None:
                # Padding contributes to the format but yields no field.
                fmts.append(elt[1])
            else:
                val = None
                names.append(elt[0])
                if isinstance(elt[1], str):
                    # Scalar: one format code, one value taken at unpack time.
                    fmts.append(elt[1])
                elif isinstance(elt[1], tuple):
                    # The parser represents arrays as tuples.
                    val = self.__make_array(elt[1], fmts, elt[0])
                else:
                    # Anything else is the field list of an inner struct.
                    val = self.__make_named(elt[1], fmts, elt[0])
                values.append(val)
        t = collections.namedtuple(name, names)
        return t, t(*values)

    def __make_array(self, l, fmts, name):
        """Build the list factory and the template of one array field.

        Args:
            l: tuple with one description per array element (a format code
                string, or the field list of a struct).
            fmts: list extended in place with the format codes.
            name: name of the array field; struct elements are always built
                with the type name ``"x"``, so this argument is not used.

        Returns:
            A ``(factory, template)`` pair where the factory builds a list
            and the template is a list holding ``None`` for a scalar element
            or a nested ``(factory, template)`` pair for a struct element.
        """
        values = []
        for elt in l:
            if isinstance(elt, str):
                fmts.append(elt)
                values.append(None)
            else:
                val = self.__make_named(elt, fmts, "x")
                values.append(val)
        t = self.__class__.__lister
        return t, t(*values)

    @staticmethod
    def __lister(*l):
        """Return the positional arguments as a list (factory for arrays)."""
        return list(l)

    def __gen(self, t, it):
        """Rebuild one nesting level from the flat sequence of values.

        Args:
            t: ``(factory, template)`` pair built by ``__make_named`` or
                ``__make_array``.
            it: iterator over the flat values returned by ``struct``; it is
                consumed in field order.

        Returns:
            The object built by the factory (a namedtuple or a list).
        """
        l = []
        for g in t[1]:
            if g is None:
                # Scalar slot: take the next unpacked value.
                l.append(next(it))
            else:
                l.append(self.__gen(g, it))
        return t[0](*l)

    def unpack(self, buffer):
        """Unpack *buffer* and return the nested namedtuple.

        Args:
            buffer: binary data laid out according to ``self.fmt``.

        Returns:
            A namedtuple named ``self.name``; inner structs are namedtuples
            and arrays are lists.
        """
        # Use the Struct compiled in __init__ instead of handing the format
        # string to struct.unpack again on every call.
        l = self.struct.unpack(buffer)
        t = self.__gen(self.t, iter(l))
        return t
This is an old thread, but I had the same issue and did not find any library for python that supports nested structs.
Therefore I wrote the pycstruct library.
It supports structs, unions, bitfields and enums. It also supports all byteorders, alignments and nesting at any level.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With