I know that Protocol Buffers is a serialization format that requires the message definition from the .proto file in order to read the data back properly. But I have a file whose message format I do not know, because it isn't published. What I am trying to do is reverse engineer the data myself so I can reconstruct the messages. To do this I need to dump the raw file in a form where I can pick out the field numbers, types and values.
Is there a program that will do this (preferably in Python, but C/C++ is fine too)?
Protobuf treats strings as primitive types and therefore they can not be null.
In particular, it was designed to be smaller and faster than XML. Protocol Buffers are widely used at Google for storing and interchanging all kinds of structured information. The method serves as a basis for a custom remote procedure call (RPC) system that is used for nearly all inter-machine communication at Google.
Working with Protocol Buffers: By default, gRPC uses Protocol Buffers, Google's mature open source mechanism for serializing structured data (although it can be used with other data formats such as JSON).
After doing some digging, I wrote my own tool to do this. There were other ways to do this, I'm sure, but this tool looks at the description in the source binary. It reads in the description stream and spits out a pseudo-.proto file. From that .proto file you can compile your own pb file and decode your stream.
import sys
import struct

# Dumps a serialized FileDescriptorProto (the description stream embedded in a
# binary) as a pseudo-.proto listing.
#
# Every print call passes one pre-formatted string, so the script parses on
# Python 3 without relying on print-statement syntax.

# Helper functions ------------------------------------------------------------


def _text(raw):
    """Return *raw* in printable form.

    Byte strings read from the stream are decoded as UTF-8 (undecodable bytes
    are replaced); any other value (int, None, str) is returned unchanged.
    """
    if isinstance(raw, (bytes, bytearray)) and not isinstance(raw, str):
        return raw.decode("utf-8", "replace")
    return raw


# this comes largely straight out of the google protocol-buffers code for
# DecodeVarint (internal\decoder.py) with a few tweaks to make it work for me
def readVarInt(buffer, pos):
    """Decode one base-128 varint from *buffer* starting at *pos*.

    Returns:
        (value, new position, number of bytes consumed). Values with bit 63
        set are returned as negative numbers (signed 64-bit interpretation).

    Raises:
        IndexError: the buffer ends in the middle of the varint.
        ValueError: the varint runs past ten bytes.
    """
    mask = (1 << 64) - 1
    result = 0
    shift = 0
    startPos = pos
    while True:
        if pos >= len(buffer):
            raise IndexError("Truncated varint at offset %d." % startPos)
        b = buffer[pos]
        if not isinstance(b, int):
            # Indexing a text-like byte string yields a 1-char string.
            b = ord(b)
        result |= ((b & 0x7f) << shift)
        pos += 1
        if not (b & 0x80):
            if result > 0x7fffffffffffffff:
                result -= (1 << 64)
                result |= ~mask
            else:
                result &= mask
            return (result, pos, pos - startPos)
        shift += 7
        if shift >= 64:
            # The original raised an undefined name `Error` here.
            raise ValueError('Too many bytes when decoding varint.')


def _readFixed(fmt, width, label, d, pos):
    """Unpack one fixed-width little-endian value; shared by the read* helpers.

    On a short read the failure is reported on stdout and (None, pos) is
    returned with the position unchanged, matching the best-effort behaviour
    of the individual readers.
    """
    try:
        v = struct.unpack(fmt, bytes(d[pos:pos + width]))[0]
    except struct.error:
        print("Exception in %s" % label)
        print(sys.exc_info())
        return (None, pos)
    return (v, pos + width)


def readQWORD(d, pos):
    """Read an unsigned 64-bit little-endian value; returns (value, new pos)."""
    return _readFixed("<Q", 8, "readQWORD", d, pos)


def readDWORD(d, pos):
    """Read an unsigned 32-bit little-endian value; returns (value, new pos)."""
    return _readFixed("<L", 4, "readDWORD", d, pos)


def readBYTE(d, pos):
    """Read a single unsigned byte; returns (value, new pos)."""
    return _readFixed("<B", 1, "readBYTE", d, pos)


# returns (value, new position, data type, field ID, and value's length)
def readField(d, pos):
    """Decode one field (tag + payload) from *d* starting at *pos*.

    The tag is decoded as a varint, so field numbers above 15 (whose tag does
    not fit in one byte) are handled.

    Returns:
        (value, new position, wire type, field number, value length). For
        length-delimited fields the value is the raw byte slice and the length
        is the declared length. For an unknown wire type the value is None and
        only the tag is consumed.

    Raises:
        ValueError: a length-delimited field declares a negative length.
    """
    # read field and type info
    (tag, p, _) = readVarInt(d, pos)
    datatype = tag & 7
    fieldnum = tag >> 3

    if datatype == 0:  # varint
        (v, p, l) = readVarInt(d, p)
        return (v, p, datatype, fieldnum, l)
    if datatype == 1:  # 64-bit
        (v, p) = readQWORD(d, p)
        return (v, p, datatype, fieldnum, 8)
    if datatype == 2:  # varlen string/blob
        (v, p, l) = readVarInt(d, p)  # get string length
        if v < 0:
            # A negative length would move the cursor backwards and loop.
            raise ValueError("Negative field length at offset %d." % pos)
        return (d[p:p + v], p + v, datatype, fieldnum, v)
    if datatype == 5:  # 32-bit value
        (v, p) = readDWORD(d, p)
        return (v, p, datatype, fieldnum, 4)

    print("Unknown type: %d [%x]\n" % (datatype, pos))
    return (None, p, datatype, fieldnum, 1)


def _iterFields(data, size):
    """Yield (value, wire type, field number, length) for fields in *data*.

    *size* is clamped to the bytes actually present so that a declared length
    larger than the remaining data cannot run the cursor past the buffer.
    """
    end = min(size, len(data))
    pos = 0
    while pos < end:
        (value, pos, wiretype, fid, length) = readField(data, pos)
        yield (value, wiretype, fid, length)


# PARSERS ---------------------------------------------------------------------

# Parse DescriptorProto field
def PrintDescriptorProto(data, size, prefix):
    """Print a DescriptorProto (message definition) held in data[:size]."""
    for (d, t, fid, l) in _iterFields(data, size):
        if fid == 1:
            print("%smessage %s {" % (prefix, _text(d)))
        elif fid == 2:
            PrintFieldDescriptorProto(d, l, prefix + "\t")  # FieldDescriptorProto
        elif fid == 3:
            PrintDescriptorProto(d, l, prefix + "\t")  # DescriptorProto
        elif fid == 4:
            PrintEnumDescriptorProto(d, l, prefix + "\t")  # EnumDescriptorProto
        elif fid == 5:
            print("%sextension_range:" % (prefix))
            # ExtensionRange has its own layout; it is not a DescriptorProto.
            PrintExtensionRange(d, l, prefix + "\t")
        elif fid == 6:
            print("%sextension: %s" % (prefix, _text(d)))  # FieldDescriptorProto
        elif fid == 7:
            print("%soptions: %s" % (prefix, _text(d)))  # MessageOptions
        else:
            print("***UNKNOWN fid in PrintDescriptorProto %d" % fid)
    print("%s}" % prefix)


# Parse EnumDescriptorProto
def PrintEnumDescriptorProto(data, size, prefix):
    """Print an EnumDescriptorProto (enum definition) held in data[:size]."""
    for (d, t, fid, l) in _iterFields(data, size):
        if fid == 1:
            print("%senum %s {" % (prefix, _text(d)))
        elif fid == 2:
            PrintEnumValueDescriptorProto(d, l, prefix + "\t")  # EnumValueDescriptorProto
        elif fid == 3:  # EnumOptions
            print("%soptions" % prefix)
        else:
            print("***UNKNOWN fid in PrintEnumDescriptorProto %d" % fid)
    print("%s};" % prefix)


# Parse EnumValueDescriptorProto
def PrintEnumValueDescriptorProto(data, size, prefix):
    """Print one enum value as "<name> = <number>;"."""
    enum = {"name": None, "number": None}
    for (d, t, fid, l) in _iterFields(data, size):
        if fid == 1:
            enum['name'] = _text(d)
        elif fid == 2:
            enum['number'] = d
        elif fid == 3:  # EnumValueOptions
            print("%soptions: %s" % (prefix, _text(d)))
        else:
            print("***UNKNOWN fid in PrintEnumValueDescriptorProto %d" % fid)
    print("%s%s = %s;" % (prefix, enum['name'], enum['number']))


# Parse FieldDescriptorProto
def PrintFieldDescriptorProto(data, size, prefix):
    """Print one field declaration: "<label> <type> <name> = <number>;".

    When the descriptor carries a type_name (message/enum/group fields) that
    name is printed as the type, so the referenced type is not lost behind
    the generic "message"/"enum" keyword.
    """
    labels = {1: "optional", 2: "required", 3: "repeated"}
    types = {1: "double", 2: "float", 3: "int64", 4: "uint64", 5: "int32",
             6: "fixed64", 7: "fixed32", 8: "bool", 9: "string", 10: "group",
             11: "message", 12: "bytes", 13: "uint32", 14: "enum",
             15: "sfixed32", 16: "sfixed64", 17: "sint32", 18: "sint64"}
    field = {"name": None, "extendee": None, "number": None, "label": None,
             "type": None, "type_name": None, "default_value": None,
             "options": None}

    for (d, t, fid, l) in _iterFields(data, size):
        if fid == 1:
            field['name'] = _text(d)
        elif fid == 2:
            field['extendee'] = _text(d)
        elif fid == 3:
            field['number'] = d
        elif fid == 4:
            if d in labels:
                field['label'] = labels[d]
            else:
                # The original format string had one placeholder for two
                # arguments and raised TypeError instead of reporting.
                print("%s{{Label: UNKNOWN (%d)}}" % (prefix, d))
        elif fid == 5:
            if d not in types:
                print("%sType: UNKNOWN(%d)" % (prefix, d))
            else:
                field['type'] = types[d]
        elif fid == 6:
            field["type_name"] = _text(d)
        elif fid == 7:
            field["default_value"] = _text(d)
        elif fid == 8:
            field["options"] = d
        else:
            print("***UNKNOWN fid in PrintFieldDescriptorProto %d" % fid)

    output = prefix
    if field['label'] is not None:
        output += " %s" % field['label']
    output += " %s" % (field['type_name'] or field['type'])
    output += " %s" % field['name']
    # %s rather than %d: a descriptor without a number prints "None"
    # instead of aborting the whole dump.
    output += " = %s" % field['number']
    if field['default_value']:
        output += " [DEFAULT = %s]" % field['default_value']
    output += ";"
    print(output)


# Parse ExtensionRange field
def PrintExtensionRange(data, size, prefix):
    """Print the start/end of one extension range held in data[:size]."""
    for (d, t, fid, l) in _iterFields(data, size):
        print("%stype %d, field %d, length %d" % (prefix, t, fid, l))
        if fid == 1:
            print("%sstart: %d" % (prefix, d))
        elif fid == 2:
            print("%send: %d" % (prefix, d))
        else:
            print("***UNKNOWN fid in PrintExtensionRange %d" % fid)


# Output format for each FileOptions field number handled by PrintFileOptions.
_FILE_OPTION_FORMATS = {
    1: "%soption java_package = \"%s\";",
    8: "%soption java_outer_classname = \"%s\"",
    10: "%soption java_multiple_files = %d",
    20: "%soption java_generate_equals_and_hash = %d",
    9: "%soption optimize_for = %d",
    16: "%soption cc_generic_services = %d",
    17: "%soption java_generic_services = %d",
    18: "%soption py_generic_services = %d",
    999: "%soption uninterpreted_option = \"%s\"",  # UninterpretedOption
}


def PrintFileOptions(data, size, prefix):
    """Print the FileOptions entries held in data[:size], one per line."""
    for (d, t, fid, l) in _iterFields(data, size):
        fmt = _FILE_OPTION_FORMATS.get(fid)
        if fmt is None:
            print("***UNKNOWN fid in PrintFileOptions %d" % fid)
        else:
            print(fmt % (prefix, _text(d)))


# -----------------------------------------------------------------------------
# Main function.
def ParseProto(filename, offset, size):
    """Dump the FileDescriptorProto stored in *filename* as a pseudo-.proto.

    Args:
        filename: path of the binary that embeds the descriptor.
        offset: byte offset of the descriptor within the file.
        size: length of the descriptor in bytes.

    Returns:
        An empty dict (kept for compatibility with existing callers).
    """
    # The context manager closes the handle even if decoding raises.
    with open(filename, "rb") as f:
        data = f.read()[offset:offset + size]

    for (d, t, fid, l) in _iterFields(data, size):
        if fid == 1:
            print("// source filename: %s" % _text(d))
        elif fid == 2:
            print("package %s;" % _text(d))
        elif fid == 3:
            print("import \"%s\"" % _text(d))
        elif fid == 4:
            PrintDescriptorProto(d, l, "")
        elif fid == 5:
            # Top-level enums use the same layout as nested ones.
            PrintEnumDescriptorProto(d, l, "")
        elif fid == 6:
            print("ServiceDescriptorProto: %s" % _text(d))
        elif fid == 7:
            print("FieldDescriptorProto: %s" % _text(d))
        elif fid == 8:
            PrintFileOptions(d, l, "")
        else:
            print("***UNKNOWN fid in ParseProto %d" % fid)
    return {}


# main
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: %s binaryfile offset size" % sys.argv[0])
        sys.exit(0)
    ParseProto(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
I found it useful to convert the raw message to text using `protoc --decode_raw < file`.
If the file actually contains multiple (length-prefixed) messages, save them into separate files first.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With