I want to download a zip file from the internet and unzip it in memory without saving to a temporary file. How can I do this?
Here is what I tried:
// First attempt from the question: download the archive with `request` and
// pass the response body straight to zlib.unzip.
const url = 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip';

const request = require('request');
const fs = require('fs');
const zlib = require('zlib');

// NOTE(review): zlib.unzip targets gzip/deflate streams rather than .zip
// archives, and without `encoding: null` the body is presumably delivered as
// a decoded string instead of a Buffer -- confirm against the zlib/request docs.

/** Prints the decompressed text, rethrowing any decompression error. */
function printUnzipped(err, txt) {
  if (err) throw err;
  console.log(txt.toString()); // outputs nothing
}

/** Receives the downloaded body and hands it to zlib. */
function onDownloaded(err, res, file) {
  if (err) throw err;
  zlib.unzip(file, printUnzipped);
}

request.get(url, onDownloaded);
[EDIT] As suggested, I tried using the adm-zip library, and I still cannot make this work:
// Second attempt from the question: feed the downloaded body to adm-zip's
// ZipEntry directly. Relies on `request` and `url` from the first snippet.
const ZipEntry = require('adm-zip/zipEntry');

request.get(url, function handleZip(err, res, zipFile) {
  if (err) throw err;

  const entry = new ZipEntry();
  // NOTE(review): round-tripping the body through a UTF-8 string presumably
  // corrupts the binary archive bytes -- confirm.
  const compressed = new Buffer(zipFile.toString('utf-8'));
  entry.setCompressedData(compressed);

  const text = entry.getData();
  console.log(text.toString()); // fails
});
You can also extract existing zip files by using the "unzip" package. It works for files of any size, and you need to add it as a dependency from npm. Please explain what unzip is and whether it's a built-in package or requires you to npm install it.
In Node.js, you can use the adm-zip module to create and read ZIP archives. In this tutorial, you will use the adm-zip module to compress, read, and decompress files. First, you'll combine multiple files into a ZIP archive using adm-zip.
You can then write the file using one of two methods: either by converting the zip file to a Node.js buffer using toBuffer(), or by using file.writeZip(). // One way to write the zip file: convert it to a buffer and use `fs` const fs = require('fs'); fs.
You need a library that can handle buffers. The latest version of adm-zip
will do:
npm install adm-zip
My solution uses the http.get
method, since it returns Buffer chunks.
Code:
// Download a zip archive with the core `http` module, buffer it in memory,
// and print the text of every entry whose name contains "readme".
const file_url = 'http://notepad-plus-plus.org/repository/7.x/7.6/npp.7.6.bin.x64.zip';
const AdmZip = require('adm-zip');
const http = require('http');

http.get(file_url, (res) => {
  // http.get does not follow redirects, so anything other than 200 means the
  // body is not the archive we asked for.
  if (res.statusCode !== 200) {
    res.resume(); // drain the response so the socket is released
    console.error(`Download failed: HTTP ${res.statusCode}`);
    return;
  }

  // Buffers cannot be resized, so collect the chunks and join them once the
  // response has ended.
  const chunks = [];

  res.on('data', (chunk) => {
    chunks.push(chunk);
  });

  res.on('error', (err) => {
    console.error(`Download failed: ${err.message}`);
  });

  res.on('end', () => {
    const zip = new AdmZip(Buffer.concat(chunks));
    const zipEntries = zip.getEntries();
    console.log(zipEntries.length);

    zipEntries.forEach((entry) => {
      if (entry.entryName.match(/readme/)) {
        console.log(zip.readAsText(entry));
      }
    });
  });
}).on('error', (err) => {
  // Connection-level failures (DNS, refused connection, ...) surface here.
  console.error(`Download failed: ${err.message}`);
});
The idea is to create an array of buffers and concatenate them into a new one at the end. This is due to the fact that buffers cannot be resized.
Update
This is a simpler solution that uses the request
module to obtain the response in a buffer, by setting encoding: null
in the options. It also follows redirects and resolves http/https automatically.
// Download a zip archive with `request`, receiving the body as a Buffer
// (`encoding: null`), and print the text of every readme entry.
const file_url = 'https://github.com/mihaifm/linq/releases/download/3.1.1/linq.js-3.1.1.zip';
const AdmZip = require('adm-zip');
const request = require('request');

request.get({ url: file_url, encoding: null }, (err, res, body) => {
  // On a transport error `body` is undefined, so bail out before touching it.
  if (err) {
    console.error(`Download failed: ${err.message}`);
    return;
  }
  // An error page is not a zip archive; report the status instead of letting
  // AdmZip choke on it.
  if (res.statusCode !== 200) {
    console.error(`Download failed: HTTP ${res.statusCode}`);
    return;
  }

  const zip = new AdmZip(body);
  const zipEntries = zip.getEntries();
  console.log(zipEntries.length);

  zipEntries.forEach((entry) => {
    if (entry.entryName.match(/readme/i)) {
      console.log(zip.readAsText(entry));
    }
  });
});
The body
of the response is a buffer that can be passed directly to AdmZip
, simplifying the whole process.
Sadly, you can't pipe the response stream into the unzip job the way the node zlib
lib allows you to; you have to cache the data and wait for the end of the response. For big files, I suggest you pipe the response to an fs
stream; otherwise you will fill up your memory in a blink!
I don't completely understand what you are trying to do, but imho this is the best approach. You should keep your data in memory only the time you really need it, and then stream to the csv parser.
If you want to keep all your data in memory you can replace the csv parser method fromPath
with from
, which takes a buffer instead, and have getData return unzipped directly.
You can use AdmZip
(as @mihai said) instead of node-zip
, but pay attention, because AdmZip
is not yet published in npm, so you need:
$ npm install git://github.com/cthackers/adm-zip.git
N.B. Assumption: the zip file contains only one file
const request = require('request');
const fs = require('fs');
const csv = require('csv');
const NodeZip = require('node-zip');

/**
 * Downloads a zip archive to a temporary file, unzips it, and writes each
 * entry to its own temporary file.
 *
 * @param {string} tmpFolder - Folder prefix (with trailing separator) for the temp files.
 * @param {string} url - Address of the zip archive.
 * @param {function(?Error, string=)} callback - Called with an error, or once
 *   per archive entry with the path of the extracted temp file.
 */
function getData(tmpFolder, url, callback) {
  const tempZipFilePath = tmpFolder + new Date().getTime() + Math.random();
  const tempZipFileStream = fs.createWriteStream(tempZipFilePath);

  tempZipFileStream.on('error', callback);

  // Read the archive back only after the write stream has flushed everything
  // to disk; the request's 'end' event can fire before that happens.
  tempZipFileStream.on('finish', function () {
    fs.readFile(tempZipFilePath, 'base64', function (err, zipContent) {
      if (err) {
        return callback(err);
      }

      let zip;
      try {
        zip = new NodeZip(zipContent, { base64: true });
      } catch (zipErr) {
        return callback(zipErr);
      }

      Object.keys(zip.files).forEach(function (filename) {
        const tempFilePath = tmpFolder + new Date().getTime() + Math.random();
        const unzipped = zip.files[filename].data;
        fs.writeFile(tempFilePath, unzipped, function (writeErr) {
          callback(writeErr, tempFilePath);
        });
      });
    });
  });

  request
    .get({ url: url, encoding: null })
    .on('error', function (err) {
      // pipe() does not close the destination when the source fails.
      tempZipFileStream.destroy();
      callback(err);
    })
    .pipe(tempZipFileStream);
}

getData('/tmp/', 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip', function (err, path) {
  if (err) {
    return console.error('error: %s', err.message);
  }

  const metadata = [];

  csv()
    .fromPath(path, { delimiter: '|', columns: true })
    .transform(function (data) {
      // do things with your data
      // Rows whose NAME starts with '#' are collected as metadata and dropped
      // from the output.
      if (data.NAME[0] === '#') {
        metadata.push(data.NAME);
      } else {
        return data;
      }
    })
    .on('data', function (data, index) {
      console.log('#%d %s', index, JSON.stringify(data, null, ' '));
    })
    .on('end', function (count) {
      console.log('Metadata: %s', JSON.stringify(metadata, null, ' '));
      console.log('Number of lines: %d', count);
    })
    .on('error', function (error) {
      console.error('csv parsing error: %s', error.message);
    });
});
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With