Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Stuttering/"robotic" audio in WAV recordings via Web Audio API

I am using the Web Audio API to capture WAV audio clips in-browser from users of an application while they hold down a particular key (e.g., push-to-talk) (update: this also happens when not using push-to-talk). The audio in many of the recordings stutters; you can hear an example here (start at ~5 seconds) and a different example here. What can I do to diagnose (or fix) this issue? (I've toyed with the buffer size in createScriptProcessor quite a bit, to no avail.)

The computers using the application are all MacBook Pros running Chrome 36 on OS X 10.8 or 10.9 (update: also Chrome 39/40 on 10.10). Here's chrome://version for the machine that recorded the sample linked above:

Google Chrome:   36.0.1985.143 (Official Build 287914) 
OS:              Mac OS X 
Blink:           537.36 (@179211)
JavaScript:      V8 3.26.31.15
Flash:           14.0.0.177
User Agent:      Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36
Command Line:    /Applications/Google Chrome.app/Contents/MacOS/Google Chrome --flag-switches-begin --flag-switches-end
Executable Path: /Applications/Google Chrome.app/Contents/MacOS/Google Chrome
Profile Path:    /Users/jason/Library/Application Support/Google/Chrome/Default
Variations:      e950616e-37fb3cc2
                 8afebf76-771ac34e
                 c70841c8-4866ef6e
                 195ce1b5-d93a0620
                 c4126e6a-ca7d8d80
                 9e5c75f1-ad69ceb0
                 262f996f-7075cd8
                 24dca50e-837c4893
                 ca65a9fe-91ac3782
                 8d790604-9cb2a91c
                 4ea303a6-3d47f4f4
                 d8f57532-3f4a17df
                 b2612322-f8cf70e2
                 5a3c10b5-e1cc0f14
                 244ca1ac-4ad60575
                 f47ae82a-86f22ee5
                 5e29d81-cf4f6ead
                 3ac60855-486e2a9c
                 246fb659-6e597ede
                 f296190c-65255996
                 4442aae2-6e597ede
                 ed1d377-e1cc0f14
                 75f0f0a0-a5822863
                 e2b18481-d7f6b13c
                 e7e71889-4ad60575
                 cbf0c14e-bf3e6cfd

The relevant parts of the code that make the recordings are as follows (simplified slightly):

/**
 * Requests microphone access and, once granted, routes the microphone stream
 * through a ScriptProcessorNode whose "audioprocess" events are forwarded to
 * a freshly created encoding Web Worker.
 *
 * Must be invoked with `this` set to the recorder object: it reads
 * `this.workerUrl`, `this.handleWorkerMessage` and `this.onAudioProcess`, and
 * writes `this.worker` and `this.onAudioProcess`.
 *
 * `audioContext`, `input` and `node` are shared variables that other
 * functions in this file also use (e.g. `node` in handleWorkerMessage).
 *
 * NOTE(review): nothing here sets `this.recording` to true, although
 * onAudioProcess ignores audio while it is falsy - presumably the caller
 * sets it; confirm.
 */
function startRecording() {
  // The getUserMedia callbacks are invoked without a receiver, so capture the
  // recorder explicitly instead of relying on `this` inside them.
  var recorder = this;

  navigator.getUserMedia({audio: true, video: false}, function (stream) {
    // Prefer the standard constructor; fall back to the WebKit-prefixed one.
    var AudioContextCtor = window.AudioContext || window.webkitAudioContext;
    audioContext = audioContext || new AudioContextCtor();
    input = audioContext.createMediaStreamSource(stream);
    node = input.context.createScriptProcessor(4096, 1, 1);

    input.connect(node);
    node.connect(audioContext.destination);

    recorder.worker = new Worker(recorder.workerUrl); // see Web Worker code, below
    recorder.worker.addEventListener("message", recorder.handleWorkerMessage.bind(recorder));
    recorder.worker.postMessage({command: "init"});

    // Bind the handler so it sees the recorder as `this`, and store the bound
    // function back on the recorder so that a later
    // removeEventListener("audioprocess", this.onAudioProcess) is given the
    // exact same function reference. Re-binding an already bound function is
    // harmless.
    recorder.onAudioProcess = recorder.onAudioProcess.bind(recorder);
    node.addEventListener("audioprocess", recorder.onAudioProcess);
  }, function (err) {
    // The callback-style getUserMedia expects an error callback; without one
    // a denied or unavailable microphone would go unnoticed.
    console.error("getUserMedia failed:", err);
  });
}

/**
 * Stops capturing audio and asks the worker to finish the recording.
 *
 * Must be invoked with `this` set to the recorder object. Clears
 * `this.recording` first, then posts the "end" command to `this.worker`.
 */
function stopRecording() {
  var endMessage = {command: "end"};

  this.recording = false;
  this.worker.postMessage(endMessage);
}

/**
 * "audioprocess" handler: copies the first input channel of the event's
 * buffer and posts it to the worker with the "encode" command.
 *
 * Must be invoked with `this` set to the recorder object (reads
 * `this.recording` and `this.worker`). Does nothing while `this.recording`
 * is falsy.
 *
 * @param {Object} evt - event exposing `inputBuffer.getChannelData(0)`.
 */
function onAudioProcess(evt) {
  // `stream` is not defined anywhere in this snippet; guard the lookup so a
  // missing variable does not raise a ReferenceError on every audio callback.
  // NOTE(review): presumably this is the MediaStream obtained in
  // startRecording - confirm where it is stored.
  var streamEnded = typeof stream !== "undefined" && stream.ended;
  if (!this.recording || streamEnded) return;

  // Copy the samples before handing them off, so the posted data is
  // independent of the event's own buffer.
  var samples = new Float32Array(evt.inputBuffer.getChannelData(0));

  // Transfer the copy's ArrayBuffer instead of having postMessage clone it;
  // `samples` is not used again on this side.
  this.worker.postMessage({command: "encode", buffer: samples}, [samples.buffer]);
}

/**
 * Handles messages coming back from the encoding worker.
 *
 * Only the "end" command is acted on: its `buffer` is appended to
 * `this.buffer`, the accumulated bytes are wrapped in a Blob of type
 * `this.mimeType`, and the Blob is passed to `this.callback`. Afterwards -
 * even if building or delivering the Blob throws - the worker is terminated
 * and the "audioprocess" listener is removed from the shared `node`.
 *
 * Must be invoked with `this` set to the recorder object.
 *
 * NOTE(review): `this.buffer` is not cleared here, so a later recording on
 * the same object would be appended to this one - confirm whether it is
 * reset elsewhere.
 *
 * @param {Object} evt - message event; `evt.data` is `{command, buffer}`.
 */
function handleWorkerMessage(evt) {
  var data = evt.data;
  if (data.command !== "end") return;

  this.appendToBuffer(data.buffer);
  try {
    var view = new DataView(this.buffer);
    var blob = new Blob([view], {type: this.mimeType});
    this.callback(blob);
  } finally {
    // Always release the worker and detach the handler, whatever happened above.
    this.worker.terminate();
    node.removeEventListener("audioprocess", this.onAudioProcess);
  }
}

/**
 * Appends the bytes of `buffer` to the bytes accumulated in `this.buffer`.
 *
 * When nothing has been accumulated yet, `buffer` itself is stored (no
 * copy). Otherwise a new ArrayBuffer holding the old bytes followed by the
 * new bytes replaces `this.buffer`.
 *
 * Must be invoked with `this` set to the recorder object.
 *
 * @param {ArrayBuffer} buffer - bytes to append.
 */
function appendToBuffer(buffer) {
  if (!this.buffer) {
    this.buffer = buffer;
    return;
  }

  var combined = new Uint8Array(this.buffer.byteLength + buffer.byteLength);
  combined.set(new Uint8Array(this.buffer), 0);
  combined.set(new Uint8Array(buffer), this.buffer.byteLength);
  this.buffer = combined.buffer;
}

Here is the Web Worker that saves the buffers passed to it and builds the WAV at the end (this code heavily borrows from RecordRTC):

// Worker-wide recording state, shared by the message handler below.

// Sample chunks received so far; assigned when the "init" command arrives.
var buffers;

// Running total of samples across all received chunks.
var length = 0;

// Sample rate written into the WAV header, in Hz.
var sampleRate = 44100;

/**
 * Joins a list of sample chunks into one contiguous Float32Array.
 *
 * @param {Float32Array[]} buffers - chunks, copied in order.
 * @param {number} totalLength - length of the returned array; it must be
 *   large enough for all chunks, otherwise the typed-array copy throws.
 * @returns {Float32Array} a new array with every chunk laid out back to back.
 */
function concatBuffers(buffers, totalLength) {
  var merged = new Float32Array(totalLength);
  var writePos = 0;

  buffers.forEach(function (chunk) {
    merged.set(chunk, writePos);
    writePos += chunk.length;
  });

  return merged;
}

/**
 * Writes the characters of `string` into `view` as consecutive single bytes,
 * starting at byte `offset`.
 *
 * Despite the name, no UTF-8 encoding takes place: each UTF-16 code unit is
 * stored with setUint8, one byte per code unit.
 *
 * @param {DataView} view - destination view.
 * @param {number} offset - byte position of the first character.
 * @param {string} string - text to write (e.g. a four-character chunk tag).
 */
function writeUTFBytes(view, offset, string) {
  string.split("").forEach(function (ch, position) {
    view.setUint8(offset + position, ch.charCodeAt(0));
  });
}

/**
 * Worker command handler. Each message is `{command, buffer?}`:
 *
 * - "init":   start a new recording (empties the chunk list and the sample
 *             counter).
 * - "encode": store one chunk of samples (`buffer`) and add its length to
 *             the sample counter.
 * - "end":    concatenate all chunks, encode them as a mono 16-bit PCM WAV
 *             file at `sampleRate`, and post `{command: "end", buffer}` back,
 *             where `buffer` is the ArrayBuffer holding the whole file.
 */
this.addEventListener("message", function(evt) {
  var data = evt.data;

  switch (data.command) {
  case "init":
    buffers = [];
    // Reset the counter together with the chunk list so the two cannot get
    // out of step if "init" is sent more than once.
    length = 0;
    break;
  case "encode":
    buffers.push(new Float32Array(data.buffer));
    length += data.buffer.length;
    break;
  case "end":
    var channelCount = 1;
    var bytesPerSample = 2; // 16-bit PCM
    var blockAlign = channelCount * bytesPerSample; // bytes per sample frame
    var headerSize = 44;

    var pcmBuffer = concatBuffers(buffers, length);
    var dataSize = pcmBuffer.length * bytesPerSample;
    var wavBuffer = new ArrayBuffer(headerSize + dataSize);
    var view = new DataView(wavBuffer);

    // RIFF chunk descriptor
    writeUTFBytes(view, 0, "RIFF");
    // Chunk size counts everything after this field: total file size minus
    // the 8 bytes of "RIFF" + size, i.e. 36 + data size.
    view.setUint32(4, headerSize - 8 + dataSize, true);
    writeUTFBytes(view, 8, 'WAVE');

    // FMT sub-chunk
    writeUTFBytes(view, 12, 'fmt ');
    view.setUint32(16, 16, true); // size of the fmt sub-chunk
    view.setUint16(20, 1, true); // audio format 1 = PCM

    view.setUint16(22, channelCount, true); // one channel
    view.setUint32(24, sampleRate, true);
    // Byte rate and block align must agree with the channel count and bit
    // depth declared in this header (mono, 16-bit => 2 bytes per frame).
    view.setUint32(28, sampleRate * blockAlign, true);
    view.setUint16(32, blockAlign, true);
    view.setUint16(34, bytesPerSample * 8, true); // bits per sample

    // data sub-chunk
    writeUTFBytes(view, 36, 'data');
    view.setUint32(40, dataSize, true);

    // PCM samples
    var sampleCount = pcmBuffer.length;
    var index = headerSize;
    for (var i = 0; i < sampleCount; i++) {
      // Clamp to [-1, 1] first: a value outside that range would otherwise
      // wrap around when stored as a 16-bit integer.
      var sample = Math.max(-1, Math.min(1, pcmBuffer[i]));
      view.setInt16(index, sample * 0x7FFF, true);
      index += bytesPerSample;
    }

    this.postMessage({command: "end", buffer: wavBuffer});
    break;
  }
});

Update

We started using this same technique on another product, and are seeing the same behavior, and we don't use a push-to-talk system in this product.

like image 232
Michelle Tilley Avatar asked Aug 21 '14 20:08

Michelle Tilley


2 Answers

I've noticed Chrome, unlike Firefox, seems happy to drop audio frames when a lot is happening on a page. Although you won't be notified when this happens, if you use the performance measuring tools it's possible to see what might be causing the bottleneck.

As JavaScript is single threaded (and you can't create a script processor in a worker), event handlers that take a lot of time to complete (or other things happening elsewhere in your app) can make the problem worse.

If you increase the bufferSize of your script processor to 16384 (the maximum), Chrome seems to drop fewer frames.

like image 127
Jamie Avatar answered Oct 11 '22 18:10

Jamie


I have a VERY strong feeling, based on the audio and the time delay, that this is actually your key handler - it sounds like maybe code around holding-down-the-key-to-talk is causing a start/stop storm?

like image 24
cwilso Avatar answered Oct 11 '22 16:10

cwilso