I'm given an std::istream
that contains a UTF-16 encoded string. Imagine a UTF-16 encoded text file that has been opened like this:
std::ifstream file( "mytext_utf16.txt", std::ios::binary );
I want to pass this stream to a function that takes a std::wistream&
parameter. I cannot change the file stream type to std::wifstream.
Question: Are there any facilities in the standard or boost libraries that enable me to "reinterpret" the istream as a wistream?
I'm imagining an adapter class similar to std::wbuffer_convert except that it shouldn't do any encoding conversion. Basically for each wchar_t that is read from the adapter class, it should just read two bytes from the associated istream and reinterpret_cast
them to wchar_t.
I have created an implementation using boost::iostreams that can be used like this and works like a charm:
std::ifstream file( "mytext_utf16.txt", std::ios::binary );
// Create an instance of my adapter class.
reinterpret_as_wide_stream< std::ifstream > wfile( &file );
// Read a wstring from file, using the adapter.
std::wstring str;
std::getline( wfile, str );
Why am I asking then? Because I like to reuse existing code instead of reinventing the wheel.
As there are no other answers yet, I'm posting my solution that uses the Boost.Iostreams library. Although it is pretty straightforward I still think there should be a simpler solution.
First we create a template class that models the Boost.Iostreams device concept and serves as an adapter for an associated narrow device. It forwards the read, write and seek operations to the associated device but adjusts stream position and size values to accommodate the difference in size between the narrow and the wide character types.
"basic_reinterpret_device.h"
#pragma once
#include <ios>
#include <stdexcept>
#include <boost/iostreams/traits.hpp>
#include <boost/iostreams/read.hpp>
#include <boost/iostreams/write.hpp>
#include <boost/iostreams/seek.hpp>
// Boost.Iostreams device adapter that presents a byte-oriented device as a
// device of CharT characters WITHOUT any encoding conversion: every CharT is
// simply the next sizeof( CharT ) bytes of the associated device.
//
// All sizes and positions exchanged with the caller are counted in CharT
// units; they are scaled by sizeof( CharT ) before being forwarded to the
// associated device and scaled back afterwards.
//
// The adapter stores a non-owning pointer: the associated device must outlive
// the adapter.
//
// CategoryT: boost.iostreams device category tag
// DeviceT  : type of associated narrow device
// CharT    : (wide) character type of this device adapter
template< typename CategoryT, typename DeviceT, typename CharT >
class basic_reinterpret_device
{
public:
    using category = CategoryT;   // required by boost::iostreams device concept
    using char_type = CharT;      // required by boost::iostreams device concept
    using associated_device = DeviceT;
    using associated_char_type = typename boost::iostreams::char_type_of< DeviceT >::type;

    static_assert( sizeof( associated_char_type ) == 1, "Associated device must have a byte-sized char_type" );

    // Default constructor: no associated device. Every I/O member function
    // throws std::runtime_error while the adapter is in this state.
    basic_reinterpret_device() = default;

    // Construct from a narrow device (not owned by the adapter).
    explicit basic_reinterpret_device( DeviceT* pDevice ) :
        m_pDevice( pDevice ) {}

    // Get the associated device (nullptr if none was supplied).
    DeviceT* get_device() const { return m_pDevice; }

    // Read up to n characters from the underlying data source into the buffer s,
    // returning the number of complete characters read; return -1 to indicate EOF.
    // Any negative result reported for the associated device is passed through
    // unchanged instead of being scaled. Bytes of a trailing incomplete character
    // are not counted.
    // Throws std::runtime_error if there is no associated device.
    std::streamsize read( char_type* s, std::streamsize n )
    {
        ThrowIfDeviceNull();

        const std::streamsize bytesRead = boost::iostreams::read(
            *m_pDevice,
            reinterpret_cast<associated_char_type*>( s ),
            n * char_size );

        if( bytesRead < 0 )   // EOF (-1) or another negative status: do not scale
            return bytesRead;

        // Signed division: dividing by the unsigned sizeof() value would convert
        // bytesRead to an unsigned type first.
        return bytesRead / char_size;
    }

    // Write up to n characters from the buffer s to the output sequence, returning the
    // number of complete characters written.
    // Throws std::runtime_error if there is no associated device.
    std::streamsize write( const char_type* s, std::streamsize n )
    {
        ThrowIfDeviceNull();

        const std::streamsize bytesWritten = boost::iostreams::write(
            *m_pDevice,
            reinterpret_cast<const associated_char_type*>( s ),
            n * char_size );

        return bytesWritten / char_size;
    }

    // Advances the read/write head by off characters, returning the new position
    // (in characters), where the offset is calculated from:
    // - the start of the sequence if way == ios_base::beg
    // - the current position if way == ios_base::cur
    // - the end of the sequence if way == ios_base::end
    // A negative (failure) position reported for the associated device is returned
    // unchanged.
    // Throws std::runtime_error if there is no associated device.
    std::streampos seek( std::streamoff off, std::ios_base::seekdir way )
    {
        ThrowIfDeviceNull();

        // Scale with signed arithmetic so that negative offsets stay negative.
        const std::streampos bytePos = boost::iostreams::seek( *m_pDevice, off * char_size, way );

        const std::streamoff byteOffset = static_cast<std::streamoff>( bytePos );
        if( byteOffset < 0 )   // seek failed: do not turn -1 into a bogus position
            return bytePos;

        return std::streampos( byteOffset / char_size );
    }

protected:
    // Throws std::runtime_error when no device has been associated.
    void ThrowIfDeviceNull() const
    {
        if( ! m_pDevice )
            throw std::runtime_error( "basic_reinterpret_device - no associated device" );
    }

private:
    // Size of one char_type in bytes, as a signed value so that size and
    // position arithmetic is never promoted to an unsigned type.
    static constexpr std::streamsize char_size = static_cast<std::streamsize>( sizeof( char_type ) );

    DeviceT* m_pDevice = nullptr;
};
To simplify usage of this template, we create some alias templates for the most common Boost.Iostreams device tags. Based on these we create alias templates to build standard-compatible stream buffers and streams.
"reinterpret_stream.h"
#pragma once
#include "basic_reinterpret_device.h"
#include <boost/iostreams/categories.hpp>
#include <boost/iostreams/traits.hpp>
#include <boost/iostreams/stream.hpp>
#include <boost/iostreams/stream_buffer.hpp>
// Category tags for the adapter variants that have no ready-made tag in
// Boost.Iostreams. Each one is built purely by inheriting existing
// Boost.Iostreams tag types.

// Combines source_tag and sink_tag (both inherited virtually).
struct reinterpret_device_tag : virtual boost::iostreams::source_tag, virtual boost::iostreams::sink_tag {};
// Combines device_tag with the input_seekable mode.
struct reinterpret_source_seekable_tag : boost::iostreams::device_tag, boost::iostreams::input_seekable {};
// Combines device_tag with the output_seekable mode.
struct reinterpret_sink_seekable_tag : boost::iostreams::device_tag, boost::iostreams::output_seekable {};
// Convenience aliases that fix the category tag of basic_reinterpret_device.
// DeviceT is the associated narrow device type, CharT the character type
// exposed by the adapter.

// Adapter with category boost::iostreams::source_tag.
template< typename DeviceT, typename CharT >
using reinterpret_source = basic_reinterpret_device< boost::iostreams::source_tag, DeviceT, CharT >;
// Adapter with category boost::iostreams::sink_tag.
template< typename DeviceT, typename CharT >
using reinterpret_sink = basic_reinterpret_device< boost::iostreams::sink_tag, DeviceT, CharT >;
// Adapter with category reinterpret_device_tag (source_tag + sink_tag).
template< typename DeviceT, typename CharT >
using reinterpret_device = basic_reinterpret_device< reinterpret_device_tag, DeviceT, CharT >;
// Adapter with category boost::iostreams::seekable_device_tag.
template< typename DeviceT, typename CharT >
using reinterpret_device_seekable = basic_reinterpret_device< boost::iostreams::seekable_device_tag, DeviceT, CharT >;
// Adapter with category reinterpret_source_seekable_tag.
template< typename DeviceT, typename CharT >
using reinterpret_source_seekable =
basic_reinterpret_device< reinterpret_source_seekable_tag, DeviceT, CharT >;
// Adapter with category reinterpret_sink_seekable_tag.
template< typename DeviceT, typename CharT >
using reinterpret_sink_seekable =
basic_reinterpret_device< reinterpret_sink_seekable_tag, DeviceT, CharT >;
// Stream buffers and streams built on the seekable adapter variants, with the
// adapter's character type fixed to wchar_t. DeviceT is the associated narrow
// device type.

// boost::iostreams::stream_buffer over the input-seekable adapter.
template< typename DeviceT >
using reinterpret_as_wistreambuf = boost::iostreams::stream_buffer< reinterpret_source_seekable< DeviceT, wchar_t > >;
// boost::iostreams::stream_buffer over the output-seekable adapter.
template< typename DeviceT >
using reinterpret_as_wostreambuf = boost::iostreams::stream_buffer< reinterpret_sink_seekable< DeviceT, wchar_t > >;
// boost::iostreams::stream_buffer over the seekable_device_tag adapter.
template< typename DeviceT >
using reinterpret_as_wstreambuf = boost::iostreams::stream_buffer< reinterpret_device_seekable< DeviceT, wchar_t > >;
// boost::iostreams::stream over the input-seekable adapter.
template< typename DeviceT >
using reinterpret_as_wistream = boost::iostreams::stream< reinterpret_source_seekable< DeviceT, wchar_t > >;
// boost::iostreams::stream over the output-seekable adapter.
template< typename DeviceT >
using reinterpret_as_wostream = boost::iostreams::stream< reinterpret_sink_seekable< DeviceT, wchar_t > >;
// boost::iostreams::stream over the seekable_device_tag adapter.
template< typename DeviceT >
using reinterpret_as_wstream = boost::iostreams::stream< reinterpret_device_seekable< DeviceT, wchar_t > >;
Usage examples:
#include "reinterpret_stream.h"
void read_something_as_utf16( std::istream& input )
{
reinterpret_as_wistream< std::istream > winput( &input );
std::wstring wstr;
std::getline( winput, wstr );
}
void write_something_as_utf16( std::ostream& output )
{
reinterpret_as_wostream< std::ostream > woutput( &output );
woutput << L"сайт вопросов и ответов для программистов";
}
This is work in progress
This is nothing you should use as-is, but it may be a hint about where to start if you haven't thought about doing such a thing yet. If this is not helpful, or if you can work out a better solution, I am glad to remove or extend this answer.
As far as I understand you want to read a UTF-8 file and simply cast each single character into wchar_t.
If the standard facilities do more than you need, couldn't you write your own facet?
#include <codecvt>
#include <locale>
#include <fstream>
#include <cwchar>
#include <iostream>
#include <fstream>
/// Codecvt-style helper for std::wbuffer_convert that performs NO encoding
/// conversion: each wide character corresponds to exactly sizeof(wchar_t)
/// consecutive bytes of the byte stream, copied verbatim in the platform's
/// native byte order.
///
/// `out` and `in` are exact inverses of each other, so bytes produced by `out`
/// are turned back into the original wide characters by `in`.
///
/// The class is stateless; the state argument of `in`/`out` is ignored.
class MyConvert
{
public:
    using state_type = std::mbstate_t;
    using result = std::codecvt_base::result;
    using From = char;     // external character type (bytes of the stream)
    using To = wchar_t;    // internal character type (wide characters)

    /// @return always false: `in`/`out` must be called, because the external
    ///         and internal character types have different sizes.
    bool always_noconv() const noexcept {
        return false;
    }

    /// Reassembles wide characters from raw bytes.
    ///
    /// @param from,from_end  byte range to read.
    /// @param from_next      set to the first byte that was NOT consumed.
    /// @param to,to_end      wide-character buffer to fill.
    /// @param to_next        set to one past the last wide character written.
    /// @return result::ok if every input byte was consumed; result::partial if
    ///         the output buffer is full or fewer than sizeof(wchar_t) bytes
    ///         remain (an incomplete trailing character is left unconsumed).
    result in(state_type& /*state*/, const From* from,
              const From* from_end, const From*& from_next,
              To* to, To* to_end, To*& to_next) const
    {
        // The *_next arguments are pure out-parameters: callers are not
        // required to initialise them, so they must be set before use.
        from_next = from;
        to_next = to;

        while (from_next != from_end) {
            const bool unit_incomplete =
                static_cast<std::size_t>(from_end - from_next) < sizeof(To);
            if (unit_incomplete || to_next == to_end) {
                return result::partial;
            }

            // Copy byte by byte through a char pointer: always well-defined
            // and free of alignment requirements on the byte buffer.
            From* const unit_bytes = reinterpret_cast<From*>(to_next);
            for (std::size_t i = 0; i < sizeof(To); ++i) {
                unit_bytes[i] = from_next[i];
            }

            from_next += sizeof(To);
            ++to_next;
        }
        return result::ok;
    }

    /// Writes the raw bytes of each wide character to the byte buffer.
    ///
    /// @param from,from_end  wide-character range to read.
    /// @param from_next      set to the first wide character NOT consumed.
    /// @param to,to_end      byte buffer to fill.
    /// @param to_next        set to one past the last byte written.
    /// @return result::ok if every wide character was written; result::partial
    ///         if fewer than sizeof(wchar_t) bytes of output space remain.
    result out(state_type& /*state*/, const To* from,
               const To* from_end, const To*& from_next,
               From* to, From* to_end, From*& to_next) const
    {
        // See in(): the *_next arguments must be initialised here.
        from_next = from;
        to_next = to;

        while (from_next < from_end) {
            // A whole character must fit; checking for a single free byte
            // would let the copy below run past the end of the buffer.
            if (to_next >= to_end ||
                static_cast<std::size_t>(to_end - to_next) < sizeof(To)) {
                return result::partial;
            }

            const From* const unit_bytes = reinterpret_cast<const From*>(from_next);
            for (std::size_t i = 0; i < sizeof(To); ++i) {
                to_next[i] = unit_bytes[i];
            }

            to_next += sizeof(To);
            ++from_next;
        }
        return result::ok;
    }
};
// Test driver: writes a wide string literal to "test2.out" through a
// std::wbuffer_convert that uses MyConvert as its conversion object.
// Returns 1 if the output file cannot be opened, 0 otherwise.
int main() {
    // Binary mode: the converter emits raw bytes, which must not go through
    // any text-mode newline translation.
    std::ofstream of2("test2.out", std::ios::binary);
    if (!of2) {
        std::cerr << "cannot open test2.out for writing" << '\n';
        return 1;
    }

    // wbuffer_convert is given only the stream buffer; with no converter
    // argument it default-constructs its own MyConvert.
    std::wbuffer_convert<MyConvert, wchar_t> conv(of2.rdbuf());
    std::wostream wof2(&conv);

    wof2 << L"сайт вопросов и ответов для программистов";

    // Push the buffered wide characters through the converter into of2.
    wof2.flush();
    return 0;
}
This is nothing you should use in your code. If this goes in the right direction, you need to read the documentation, including what is required of such a facet, what all these pointers mean, and how you need to write to them.
If you want to use something like this, you need to think about which template arguments you should use for the facet (if any).
Update I've now updated my code. The out-function is now closer to what we want I think. It is not beautiful and just a test code, and I am still unsure why __from_next
is not updated (or kept).
Currently the problem is that we cannot write to the stream. With gcc we just fall out of the sync of the wbuffer_convert; with clang we get a SIGILL.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With