Question: I am trying to use the espeak text-to-speech engine. So for I got it working wounderfully on linux (code below). Now I wanted to port this basic program to windows, too, but it's nearly impossible...
Part of the problem is that the windows dll only allows for AUDIO_OUTPUT_SYNCHRONOUS, which means it requires a callback, but I can't figure out how to play the audio from the callback... First it crashed, then I realized, I need a callback function, now I get the data in the callback function, but I don't know how to play it... as it is neither a wav file nor plays automatically as on Linux.
The sourceforge site is rather useless, because it basically says use the SAPI version, but then there is no example on how to use the sapi espeak dll...
Anyway, here's my code, can anybody help?
#ifdef __cplusplus
#include <cstdio>
#include <cstdlib>
#include <cstring>
#else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#include <assert.h>
#include <ctype.h>
//#include "speak_lib.h"
#include "espeak/speak_lib.h"
// libespeak-dev: /usr/include/espeak/speak_lib.h
// apt-get install libespeak-dev
// apt-get install libportaudio-dev
// g++ -o mine mine.cpp -lespeak
// g++ -o mine mine.cpp -I/usr/include/espeak/ -lespeak
// gcc -o mine mine.cpp -I/usr/include/espeak/ -lespeak
char voicename[40];
int samplerate;
int quiet = 0;
static char genders[4] = {' ','M','F',' '};
//const char *data_path = "/usr/share/"; // /usr/share/espeak-data/
const char *data_path = NULL; // use default path for espeak-data
int strrcmp(const char *s, const char *sub)
{
int slen = strlen(s);
int sublen = strlen(sub);
return memcmp(s + slen - sublen, sub, sublen);
}
char * strrcpy(char *dest, const char *source)
{
// Pre assertions
assert(dest != NULL);
assert(source != NULL);
assert(dest != source);
// tk: parentheses
while((*dest++ = *source++))
;
return(--dest);
}
const char* GetLanguageVoiceName(const char* pszShortSign)
{
#define LANGUAGE_LENGTH 30
static char szReturnValue[LANGUAGE_LENGTH] ;
memset(szReturnValue, 0, LANGUAGE_LENGTH);
for (int i = 0; pszShortSign[i] != '\0'; ++i)
szReturnValue[i] = (char) tolower(pszShortSign[i]);
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
if( !strrcmp( v->languages, szReturnValue) )
{
strcpy(szReturnValue, v->name);
return szReturnValue;
}
} // End for
strcpy(szReturnValue, "default");
return szReturnValue;
} // End function getvoicename
void ListVoices()
{
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
printf("Shortsign: %s\n", v->languages);
printf("age: %d\n", v->age);
printf("gender: %c\n", genders[v->gender]);
printf("name: %s\n", v->name);
printf("\n\n");
} // End for
} // End function getvoicename
int main()
{
printf("Hello World!\n");
const char* szVersionInfo = espeak_Info(NULL);
printf("Espeak version: %s\n", szVersionInfo);
samplerate = espeak_Initialize(AUDIO_OUTPUT_PLAYBACK,0,data_path,0);
strcpy(voicename, "default");
// espeak --voices
strcpy(voicename, "german");
strcpy(voicename, GetLanguageVoiceName("DE"));
if(espeak_SetVoiceByName(voicename) != EE_OK)
{
printf("Espeak setvoice error...\n");
}
static char word[200] = "Hello World" ;
strcpy(word, "TV-fäns aufgepasst, es ist 20 Uhr 15. Zeit für Rambo 3");
strcpy(word, "Unnamed Player wurde zum Opfer von GSG9");
int speed = 220;
int volume = 500; // volume in range 0-100 0=silence
int pitch = 50; // base pitch, range 0-100. 50=normal
// espeak.cpp 625
espeak_SetParameter(espeakRATE, speed, 0);
espeak_SetParameter(espeakVOLUME,volume,0);
espeak_SetParameter(espeakPITCH,pitch,0);
// espeakRANGE: pitch range, range 0-100. 0-monotone, 50=normal
// espeakPUNCTUATION: which punctuation characters to announce:
// value in espeak_PUNCT_TYPE (none, all, some),
espeak_VOICE *voice_spec = espeak_GetCurrentVoice();
voice_spec->gender=2; // 0=none 1=male, 2=female,
//voice_spec->age = age;
espeak_SetVoiceByProperties(voice_spec);
espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
strcpy(voicename, GetLanguageVoiceName("EN"));
espeak_SetVoiceByName(voicename);
strcpy(word, "Geany was fragged by GSG9 Googlebot");
strcpy(word, "Googlebot");
espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
espeak_Terminate();
printf("Espeak terminated\n");
return EXIT_SUCCESS;
}
/*
if(espeak_SetVoiceByName(voicename) != EE_OK)
{
memset(&voice_select,0,sizeof(voice_select));
voice_select.languages = voicename;
if(espeak_SetVoiceByProperties(&voice_select) != EE_OK)
{
fprintf(stderr,"%svoice '%s'\n",err_load,voicename);
exit(2);
}
}
*/
#ifdef __cplusplus
#include <cstdio>
#include <cstdlib>
#include <cstring>
#else
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#endif
#include <assert.h>
#include <ctype.h>
#include "speak_lib.h"
//#include "espeak/speak_lib.h"
// libespeak-dev: /usr/include/espeak/speak_lib.h
// apt-get install libespeak-dev
// apt-get install libportaudio-dev
// g++ -o mine mine.cpp -lespeak
// g++ -o mine mine.cpp -I/usr/include/espeak/ -lespeak
// gcc -o mine mine.cpp -I/usr/include/espeak/ -lespeak
char voicename[40];
int iSampleRate;
int quiet = 0;
static char genders[4] = {' ','M','F',' '};
//const char *data_path = "/usr/share/"; // /usr/share/espeak-data/
//const char *data_path = NULL; // use default path for espeak-data
const char *data_path = "C:\\Users\\Username\\Desktop\\espeak-1.43-source\\espeak-1.43-source\\";
int strrcmp(const char *s, const char *sub)
{
int slen = strlen(s);
int sublen = strlen(sub);
return memcmp(s + slen - sublen, sub, sublen);
}
char * strrcpy(char *dest, const char *source)
{
// Pre assertions
assert(dest != NULL);
assert(source != NULL);
assert(dest != source);
// tk: parentheses
while((*dest++ = *source++))
;
return(--dest);
}
const char* GetLanguageVoiceName(const char* pszShortSign)
{
#define LANGUAGE_LENGTH 30
static char szReturnValue[LANGUAGE_LENGTH] ;
memset(szReturnValue, 0, LANGUAGE_LENGTH);
for (int i = 0; pszShortSign[i] != '\0'; ++i)
szReturnValue[i] = (char) tolower(pszShortSign[i]);
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
if( !strrcmp( v->languages, szReturnValue) )
{
strcpy(szReturnValue, v->name);
return szReturnValue;
}
} // End for
strcpy(szReturnValue, "default");
return szReturnValue;
} // End function getvoicename
void ListVoices()
{
const espeak_VOICE **voices;
espeak_VOICE voice_select;
voices = espeak_ListVoices(NULL);
const espeak_VOICE *v;
for(int ix=0; (v = voices[ix]) != NULL; ix++)
{
printf("Shortsign: %s\n", v->languages);
printf("age: %d\n", v->age);
printf("gender: %c\n", genders[v->gender]);
printf("name: %s\n", v->name);
printf("\n\n");
} // End for
} // End function getvoicename
/* Callback from espeak. Directly speaks using AudioTrack. */
#define LOGI(x) printf("%s\n", x)
static int AndroidEspeakDirectSpeechCallback(short *wav, int numsamples, espeak_EVENT *events)
{
char buf[100];
sprintf(buf, "AndroidEspeakDirectSpeechCallback: %d samples", numsamples);
LOGI(buf);
if (wav == NULL)
{
LOGI("Null: speech has completed");
}
if (numsamples > 0)
{
//audout->write(wav, sizeof(short) * numsamples);
sprintf(buf, "AudioTrack wrote: %d bytes", sizeof(short) * numsamples);
LOGI(buf);
}
return 0; // continue synthesis (1 is to abort)
}
static int AndroidEspeakSynthToFileCallback(short *wav, int numsamples,espeak_EVENT *events)
{
char buf[100];
sprintf(buf, "AndroidEspeakSynthToFileCallback: %d samples", numsamples);
LOGI(buf);
if (wav == NULL)
{
LOGI("Null: speech has completed");
}
// The user data should contain the file pointer of the file to write to
//void* user_data = events->user_data;
FILE* user_data = fopen ( "myfile1.wav" , "ab" );
FILE* fp = static_cast<FILE *>(user_data);
// Write all of the samples
fwrite(wav, sizeof(short), numsamples, fp);
return 0; // continue synthesis (1 is to abort)
}
int main()
{
printf("Hello World!\n");
const char* szVersionInfo = espeak_Info(NULL);
printf("Espeak version: %s\n", szVersionInfo);
iSampleRate = espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 4096, data_path, 0);
if (iSampleRate <= 0)
{
printf("Unable to initialize espeak");
return EXIT_FAILURE;
}
//samplerate = espeak_Initialize(AUDIO_OUTPUT_PLAYBACK,0,data_path,0);
//ListVoices();
strcpy(voicename, "default");
// espeak --voices
//strcpy(voicename, "german");
//strcpy(voicename, GetLanguageVoiceName("DE"));
if(espeak_SetVoiceByName(voicename) != EE_OK)
{
printf("Espeak setvoice error...\n");
}
static char word[200] = "Hello World" ;
strcpy(word, "TV-fäns aufgepasst, es ist 20 Uhr 15. Zeit für Rambo 3");
strcpy(word, "Unnamed Player wurde zum Opfer von GSG9");
int speed = 220;
int volume = 500; // volume in range 0-100 0=silence
int pitch = 50; // base pitch, range 0-100. 50=normal
// espeak.cpp 625
espeak_SetParameter(espeakRATE, speed, 0);
espeak_SetParameter(espeakVOLUME,volume,0);
espeak_SetParameter(espeakPITCH,pitch,0);
// espeakRANGE: pitch range, range 0-100. 0-monotone, 50=normal
// espeakPUNCTUATION: which punctuation characters to announce:
// value in espeak_PUNCT_TYPE (none, all, some),
//espeak_VOICE *voice_spec = espeak_GetCurrentVoice();
//voice_spec->gender=2; // 0=none 1=male, 2=female,
//voice_spec->age = age;
//espeak_SetVoiceByProperties(voice_spec);
//espeak_SetSynthCallback(AndroidEspeakDirectSpeechCallback);
espeak_SetSynthCallback(AndroidEspeakSynthToFileCallback);
unsigned int unique_identifier;
espeak_ERROR err = espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, &unique_identifier, NULL);
err = espeak_Synchronize();
/*
strcpy(voicename, GetLanguageVoiceName("EN"));
espeak_SetVoiceByName(voicename);
strcpy(word, "Geany was fragged by GSG9 Googlebot");
strcpy(word, "Googlebot");
espeak_Synth( (char*) word, strlen(word)+1, 0, POS_CHARACTER, 0, espeakCHARS_AUTO, NULL, NULL);
espeak_Synchronize();
*/
// espeak_Cancel();
espeak_Terminate();
printf("Espeak terminated\n");
system("pause");
return EXIT_SUCCESS;
}
eSpeak converts text to phonemes with pitch and length information. Can translate text into phoneme codes, so it could be adapted as a front end for another speech synthesis engine. Potential for other languages. Several are included in varying stages of progress.
Download the latest version of eSpeak for Windows from Sourceforge. The installation files will be in your downloads directory in a zip archive. Double click the zip archive to open it, and transfer the contents to the desktop. Read the release notes, then double-click setup_espeak.exe to start the installation.
To use this Download and install eSpeak. Then open the command prompt and run cd C:\Program Files\eSpeak\command_line\ then run the commands below: espeak "Hello World!" This is the simplest command.
Use the -v option to specifiy a voice. After that you can provide the type of language, such as en or en-us. After that, add a plus, then either m or f , and a 1 – 5 .
Have you tried passing the buffer you obtain in your callback to sndplaysnd()??
Declare Function sndPlaySound Lib "winmm.dll" Alias "sndPlaySoundA" (ByVal lpszSoundName As String, ByVal uFlags As Long) As Long
Its standard winAPI is as follows:
sndPlaySound(buffer[0], SND_ASYNC | SND_MEMORY)
Alternately, if you have a wav-file that has the audio to play:
sndPlaySound(filename, SND_ASYNC)
playsound has a ASYNC mode that wouldn't block your program's execution while the audio is being played.
NOTE: I have used it in VB and the above snippets are for use in VB. If you are coding in VC++, you might have to modify them accordingly. But the basic intention remains the same; to pass the buffer to sndPlaySound with the ASYNC flag set.
Good LUCK!!
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With