I have already learned that readline can be used to read a file line by line, e.g.
readline
  .createInterface({ input: fs.createReadStream('xxx') })
  .on('line', (line) => { apply_regexp_on_line })
  .on('close', () => { report_all_regexps });
However, this is pretty slow. I compared the performance of grep against JavaScript regexps, and the latter was faster on the regexps I tested (see benchmark), so I think the bottleneck is Node's async readline.
In my situation, I do not care about async at all; I just need to exploit JavaScript's fast regexp engine to process very large log files (typically 1-2 GB, sometimes up to 10 GB). What is the best way of doing this? My only concern is speed.
Bonus points: some of the log files are gzipped, so I need to uncompress them first. If someone can recommend a fast line-by-line reader that works for both plain text and gzipped text, I would really appreciate it.
In all current versions of Node.js, readline.createInterface can be used as an async iterable, to read a file line by line, or just for the first line. This is also safe to use with empty files.
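For example, here is a minimal sketch of that approach; the function name, file path, and match-counting logic are my own illustration, not from the question:

const fs = require('fs');
const readline = require('readline');

// Count the lines of a file that match a regex, using readline's
// async-iterable interface. Path and regex are placeholders.
async function countMatches(path, regex) {
    const rl = readline.createInterface({
        input: fs.createReadStream(path),
        crlfDelay: Infinity, // treat \r\n as a single line break
    });
    let matches = 0;
    for await (const line of rl) {
        if (regex.test(line)) matches++;
    }
    return matches;
}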
How does this hold up against your data?
// module linegrep.js
'use strict';
var through2 = require('through2');
var StringDecoder = require('string_decoder').StringDecoder;

function grep(regex) {
    var decoder = new StringDecoder('utf8'),
        last = "",          // holds the trailing partial line between chunks
        lineEnd = /\r?\n/;

    // emit matching lines as discrete objects on the readable side
    var stream = through2({ readableObjectMode: true }, function transform(chunk, enc, cb) {
        // decode the buffer through StringDecoder so multi-byte UTF-8
        // sequences split across chunk boundaries stay intact
        var lines = (last + decoder.write(chunk)).split(lineEnd), i;
        // the last element may be an incomplete line; keep it for later
        last = lines.pop();
        for (i = 0; i < lines.length; i++) {
            if (regex.test(lines[i])) this.push(lines[i]);
        }
        cb();
    }, function flush(cb) {
        // flush any bytes still buffered in the decoder, then emit the
        // final line (skip it when the input ended with a newline)
        last += decoder.end();
        if (last && regex.test(last)) this.push(last);
        cb();
    });

    return stream;
}

module.exports = grep;
and
// index.js
'use strict';
var fs = require('fs');
var zlib = require('zlib');
var grep = require('./linegrep');

function grepFile(filename, regex) {
    // a large highWaterMark reduces the number of read calls
    var rstream = fs.createReadStream(filename, { highWaterMark: 172 * 1024 });
    // transparently gunzip compressed logs before grepping
    if (/\.gz$/.test(filename)) rstream = rstream.pipe(zlib.createGunzip());
    return rstream.pipe(grep(regex));
}
// -------------------------------------------------------------------------
var t = Date.now(), mc = 0;

grepFile('input.txt', /boot\.([a-z]+)_head\./).on('data', function (line) {
    mc++;
    console.log(line);
}).on('end', function () {
    console.log(mc + " matches, " + (Date.now() - t) + " ms");
});
This turns a file stream into an object stream of lines, filters them through your regex, and emits only the matching lines.
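If you need to tally several regexps in one pass (the report_all_regexps step from the question), here is a sketch of one way to build on grepFile; the pattern list is hypothetical, and /(?:)/ is the empty regex, which matches every line so nothing is filtered out:

// hypothetical example: let every line through, then tally per pattern
var patterns = { errors: /ERROR/, timeouts: /timeout/i }; // made-up patterns
var counts = { errors: 0, timeouts: 0 };

grepFile('input.txt', /(?:)/).on('data', function (line) {
    for (var name in patterns) {
        if (patterns[name].test(line)) counts[name]++;
    }
}).on('end', function () {
    console.log(counts); // report all regexp counts at the end
});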