Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Send microphone audio recorder from browser to google speech to text - Javascript

Send microphone audio recorder from browser to google speech to text. There is no need for streaming and socket, nor is there a need to do it with an HTTP request via Node.js to Google server nor via an HTTP request from client (Browser) side.

The problem I face:

The client-side implementation is done, as well as the server-side implementation. Both implementations work independently from one another. I am getting audio data from the microphone and I am able to play it, as well as being able to test the server-side implementation using the audio.raw sample given by Google.

However, when I try to send the microphone data from the browser to my Node server and then to the Google server, I run into an encoding problem: "Getting an empty response from google server".

My question is: how can I change the encoding of the audio file and then send it to the Google Speech-to-Text server using JavaScript?

like image 627
VnoitKumar Avatar asked Dec 07 '22 11:12

VnoitKumar


2 Answers

I've played around with this and I can get speech to text working using the Google API and browser audio recording. I'm wondering if the config object might have been the cause of the issues you encountered.

The components I've used are a Node.js Server: server.js and a simple client (index.html and client-app.js). All in the same folder.

I'm using the Google Speech to Text Client Library for this, so you'll need to add a Google API key file (APIKey.json) to provide credentials.

If you run the Node server, then point your browser to http://localhost:3000/, that should allow you to test the code.

I've drawn a lot of the client-side code from here, using Matt Diamond's Recorder.js code too.

server.js

const express = require('express');
const multer = require('multer');
const fs = require('fs');

// Multer with no storage option keeps uploads in memory (exposed as file.buffer).
const upload = multer();

const app = express();
const port = 3000;

// Serve only the client files by name. Serving the whole working directory
// with express.static('./') would also hand out server.js and the
// APIKey.json credentials file that lives in the same folder.
const clientFiles = ['index.html', 'client-app.js'];
app.get('/', (req, res) => res.sendFile('index.html', { root: __dirname }));
clientFiles.forEach((file) => {
    app.get(`/${file}`, (req, res) => res.sendFile(file, { root: __dirname }));
});

async function testGoogleTextToSpeech(audioBuffer) {
    const speech = require('@google-cloud/speech');
    const client = new speech.SpeechClient( { keyFilename: "APIKey.json"});

    const audio = {
    content: audioBuffer.toString('base64'),
    };
    const config = {
    languageCode: 'en-US',
    };
    const request = {
    audio: audio,
    config: config,
    };

    const [response] = await client.recognize(request);
    const transcription = response.results
    .map(result => result.alternatives[0].transcript)
    .join('\n');
    return transcription;
}

// POST /upload_sound: transcribes the first uploaded file and replies with
// the text. Replies 400 when no file was sent and 500 when transcription
// fails, so the client never waits on a request that was left unanswered.
app.post('/upload_sound', upload.any(), async (req, res) => {
    console.log("Getting text transcription..");

    const [audioFile] = req.files || [];
    if (!audioFile) {
        res.status(400).send("No audio file was uploaded.");
        return;
    }

    try {
        const transcription = await testGoogleTextToSpeech(audioFile.buffer);
        console.log("Text transcription: " + transcription);
        res.status(200).send(transcription);
    } catch (err) {
        // A rejected promise inside an async handler would otherwise leave
        // the request without any response.
        console.error("Transcription failed:", err);
        res.status(500).send("Transcription failed.");
    }
});

// Start the HTTP server and report the port it listens on.
app.listen(port, function onListening() {
    console.log(`Express server listening on port: ${port}...`);
});

index.html

<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Speech to text test</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <link rel="stylesheet" type="text/css" href="https://bootswatch.com/4/cerulean/bootstrap.min.css">
</head>
<body style="padding:50px;">
    <h1>Speech to text test</h1>
    <!-- Both buttons are looked up by id in client-app.js; the transcription
         button starts disabled and is enabled once recording is started. -->
    <div id="controls">
    <button id="recordButton">Record</button>
    <button id="transcribeButton" disabled>Get transcription</button>
    </div>
    <!-- client-app.js writes the transcription result into this element. -->
    <div id="output"></div>
    <!-- Recorder.js supplies the Recorder constructor used by client-app.js.
         The scripts sit at the end of body because client-app.js looks up the
         buttons above as soon as it runs. -->
    <script src="https://cdn.rawgit.com/mattdiamond/Recorderjs/08e7abd9/dist/recorder.js"></script>
    <script src="client-app.js"></script>
</body>
</html>

client-app.js

// Recorder instance and microphone stream of the current recording; both are
// null until startRecording() has obtained the microphone.
let rec = null;
let audioStream = null;

// Look up each button and wire its click handler right away.
const recordButton = document.getElementById("recordButton");
recordButton.addEventListener("click", startRecording);

const transcribeButton = document.getElementById("transcribeButton");
transcribeButton.addEventListener("click", transcribeText);

/**
 * Click handler for the Record button: requests microphone access and starts
 * a single-channel Recorder.js recording.
 *
 * The buttons are switched immediately and switched back if the microphone
 * cannot be obtained or the recorder cannot be set up.
 */
function startRecording() {

    const constraints = { audio: true, video: false };

    recordButton.disabled = true;
    transcribeButton.disabled = false;

    navigator.mediaDevices.getUserMedia(constraints).then(function(stream) {
        // Older WebKit browsers only expose the prefixed constructor.
        const AudioContextClass = window.AudioContext || window.webkitAudioContext;
        const audioContext = new AudioContextClass();
        audioStream = stream;
        const input = audioContext.createMediaStreamSource(stream);
        rec = new Recorder(input, { numChannels: 1 });
        rec.record();
    }).catch(function(err) {
        // Report why recording could not start instead of failing silently.
        console.error("Unable to start recording:", err);
        recordButton.disabled = false;
        transcribeButton.disabled = true;
    });
}

/**
 * Click handler for the transcription button: stops the recording, releases
 * the microphone and hands the recorded WAV blob to uploadSoundData.
 */
function transcribeText() {
    // The button is enabled before microphone access has been granted, so a
    // quick click can arrive while there is no recorder or stream yet.
    // Leave the buttons untouched so the user can click again.
    if (!rec || !audioStream) {
        return;
    }

    transcribeButton.disabled = true;
    recordButton.disabled = false;
    rec.stop();
    // Stop every audio track so the browser releases the microphone.
    audioStream.getAudioTracks().forEach(function(track) {
        track.stop();
    });
    rec.exportWAV(uploadSoundData);
}

/**
 * Uploads the recorded audio to /upload_sound as multipart form data and
 * shows the server's reply in the #output element.
 *
 * @param {Blob} blob - Recorded audio, sent as the "audio_data" form field.
 */
function uploadSoundData(blob) {
    const filename = new Date().toISOString();
    const output = document.getElementById("output");

    // The reply is inserted as a text node, never as HTML, so markup in the
    // response cannot be injected into the page.
    function showResult(text) {
        output.innerHTML = "<br><br><strong>Result: </strong>";
        output.appendChild(document.createTextNode(text));
    }

    const xhr = new XMLHttpRequest();
    xhr.onload = function() {
        const succeeded = xhr.status >= 200 && xhr.status < 300;
        showResult(succeeded ? xhr.responseText : `Upload failed (HTTP ${xhr.status})`);
    };
    xhr.onerror = function() {
        showResult("Upload failed (network error)");
    };

    const formData = new FormData();
    formData.append("audio_data", blob, filename);
    xhr.open("POST", "/upload_sound", true);
    xhr.send(formData);
}
like image 57
Terry Lennox Avatar answered Dec 09 '22 23:12

Terry Lennox


@terry-lennox thank you so much for the clear answer.

But I am using React as my front end, so I used an npm package called recorder-js.

Here is the code, for the reference of anyone who sees this post in the future.

import Recorder from 'recorder-js';

import micGrey from './mic-grey.svg';
import micWhite from './mic-white.svg';

import './App.css';

// Module-level recording state shared by the App methods: the recorder-js
// instance and the microphone MediaStream. Both start out as null.
let recorder = null;
let audioStream = null;

class App extends Component {
  constructor(props) {
    super(props);
    this.mic = React.createRef();

    this.accessMic = this.accessMic.bind(this);
    this.handleClick = this.handleClick.bind(this);
    this.handleClick = this.handleClick.bind(this);
    this.handleSuccess = this.handleSuccess.bind(this);

    this.stopAccessingMic = this.stopAccessingMic.bind(this);
    this.getTextFromGoogle = this.getTextFromGoogle.bind(this);

    this.state = {
      isMicActive: false
    };
  }

  accessMic() {
    const audioContext = new (window.AudioContext ||
      window.webkitAudioContext)();

    recorder = new Recorder(audioContext);

    navigator.mediaDevices
      .getUserMedia({ audio: true })
      .then(this.handleSuccess)
      .catch(err => console.log('Uh oh... unable to get stream...', err));
  }

  handleSuccess(stream) {
    audioStream = stream;

    recorder.init(stream);
    recorder.start();
  }

  getTextFromGoogle(blob) {
    let filename = new Date().toISOString();
    let xhr = new XMLHttpRequest();
    xhr.onload = function(e) {
      if (this.readyState === 4) {
        console.log(e.target.responseText);
      }
    };
    let formData = new FormData();
    formData.append('audio_data', blob, filename);
    xhr.open('POST', 'http://localhost:3000/', true);
    xhr.send(formData);
  }

  handleClick() {
    const isMicActive = this.state.isMicActive;

    this.setState({
      isMicActive: !isMicActive
    });

    if (!isMicActive) {
      this.checkPermissions();
      this.accessMic();
    } else {
      this.stopAccessingMic();
    }
  }

  stopAccessingMic() {
    audioStream && audioStream.getTracks()[0].stop();
    recorder.stop().then(({ blob, buffer }) => {
      this.getTextFromGoogle(blob);
    });
  }

  checkPermissions() {
    navigator.permissions
      .query({ name: 'microphone' })
      .then(permissionObj => {
        console.log('Permission status - ', permissionObj.state);
      })
      .catch(error => {
        console.log('Permission status - Got error :', error);
      });
  }

  render() {
    return (
      <div className='App'>
        <div
          id='mic'
          ref={this.mic}
          onClick={this.handleClick}
          className={
            this.state.isMicActive ? 'mic-btn mic-btn-active' : 'mic-btn'
          }
        >
          <img src={this.state.isMicActive ? micWhite : micGrey} alt='mic' />
        </div>
      </div>
    );
  }
}
export default App;

And here is the back-end code for reference. I had to make one small change: I was getting the error "Must use single channel (mono) audio". To fix this I referred to Link, Link; you need to add audioChannelCount: 2 to the config.

// NOTE(review): `express` is not required anywhere in this snippet — confirm
// it is in scope before this line runs.
const router = express.Router();
const multer = require('multer');
const fs = require('fs');

// Multer with no storage option keeps uploads in memory (exposed as file.buffer).
const upload = multer();

// Respect a credentials path already provided by the environment; only fall
// back to the developer's local key file when none is set.
process.env.GOOGLE_APPLICATION_CREDENTIALS =
  process.env.GOOGLE_APPLICATION_CREDENTIALS ||
  'C:/Users/user/Desktop/Speech-to-Text-e851cb3889e5.json';

/* GET home page. */
router.post('/', upload.any(), async (req, res, next) => {
  console.log('Getting text transcription..');
  try {
    let transcription = await testGoogleTextToSpeech(req.files[0].buffer);
    console.log('Text transcription: ' + transcription);
    res.status(200).send(transcription);
  } catch (error) {
    console.log(error);
    res.status(400).send(error);
  }
});

async function testGoogleTextToSpeech(audioBuffer) {
  const speech = require('@google-cloud/speech');
  const client = new speech.SpeechClient();

  const audio = {
    content: audioBuffer.toString('base64')
  };
  const config = {
    languageCode: 'en-US',
    audioChannelCount: 2
  };
  const request = {
    audio: audio,
    config: config
  };

  try {
    const [response] = await client.recognize(request);
    const transcription = response.results
      .map(result => result.alternatives[0].transcript)
      .join('\n');
    return transcription;
  } catch (error) {
    return error;
  }
}
module.exports = router;
like image 33
VnoitKumar Avatar answered Dec 10 '22 01:12

VnoitKumar