Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Download xlsx from S3 and parse it

I need a service to download an excel file from Amazon S3, then parse with node-xlsx

The problem is that I can't get xlsx to parse the file. When I try to read back the file I just wrote, it isn't found by the code.

I'm not quite sure if this is the best approach, but this is what I got so far:

router.get('/process', (req, res) => {
    var fileName = 'https://some-bucket.s3.amazonaws.com/some-excel-file.xlsx'
    https.get(fileName, response => {
        var body = ''
        response.on('data', chunk => body += chunk)
        response.on('end', () => {

            //fs is being imported early on this file
            fs.writeFile(__dirname + '/test.xlsx', body)

            var f = fs.createReadStream(__dirname + '/test.xlsx')

            var book = xlsx.parse(f)
            book.forEach(sheet => console.log('sheet', sheet.name) )

            res.status(200)          
        })
        .on('error', e => {
            res.status(500)
        })
    })
    return
})
like image 497
Ramses Avatar asked Nov 10 '16 19:11

Ramses


4 Answers

This is how you can read a file from S3 nodejs and keep it in memory without first writing the file to some location on disk. It can be used with a combination of S3 and AWS Lambda so that you don't have to write the files to some location on the Lambda.

Remember this processes is asynchronous.

   var params = {
        Bucket: "",
        Key: ""
    };

    var file = s3.getObject(params).createReadStream();
    var buffers = [];

    file.on('data', function (data) {
        buffers.push(data);
    });

    file.on('end', function () {
        var buffer = Buffer.concat(buffers);
        var workbook = xlsx.parse(buffer);
        console.log("workbook", workbook)
    });
like image 167
David Webster Avatar answered Oct 17 '22 16:10

David Webster


If you want use async/wait, here solution:

    const AWS = require('aws-sdk');
    const XLSX = require('xlsx');

    AWS.config.update({
      accessKeyId: AMAZON_ACCESS_KEY,
      secretAccessKey: AMAZON_SECRET_ACCESS_KEY,
    });

    // Get buffered file from s3    
    function getBufferFromS3(file, callback){
      const buffers = [];
      const s3 = new AWS.S3();
      const stream = s3.getObject({ Bucket: 'yor_buket', Key: file}).createReadStream();
      stream.on('data', data => buffers.push(data));
      stream.on('end', () => callback(null, Buffer.concat(buffers)));
      stream.on('error', error => callback(error));
    }

    // promisify read stream from s3
    function getBufferFromS3Promise(file) {
      return new Promise((resolve, reject) => {
        getBufferFromS3(file, (error, s3buffer) => {
          if (error) return reject(error);
          return resolve(s3buffer);
        });
      });
    };

    // create workbook from buffer
    const buffer = await getBufferFromS3Promise(file);
    const workbook = XLSX.read(buffer);


    // If you want to send the workbook as a download to the api end point in node
    const fileName = "Categories.xlsx";
    res.setHeader('Content-disposition', 'attachment; filename=' + fileName);
    res.setHeader('Content-type', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet');
    const wbout = XLSX.write(workbook, { bookType: 'xlsx', type: 'buffer'});
    res.send(new Buffer(wbout));
like image 26
Den Rupp Avatar answered Oct 17 '22 16:10

Den Rupp


fs.writeFile is asynchronous. The file won't be there till the call back is called.

https://nodejs.org/api/fs.html#fs_fs_writefile_file_data_options_callback

fs.writeFile('message.txt', 'Hello Node.js', (err) => {
  if (err) throw err;
  console.log('It\'s saved!');
});
like image 25
Gabriel Littman Avatar answered Oct 17 '22 15:10

Gabriel Littman


The node-xlsx module requires that the entire xlsx buffer be available. So you cannot pass it a ReadStream like you're currently doing. Try this method which entirely avoids writing to disk:

router.get('/process', (req, res) => {
    var fileName = 'https://some-bucket.s3.amazonaws.com/some-excel-file.xlsx'
    https.get(fileName, response => {
        var chunks = []
        response.on('data', chunk => chunks.push(chunk))
        .on('end', () => {
            var book = xlsx.parse(Buffer.concat(chunks))
            book.forEach(sheet => console.log('sheet', sheet.name))
            res.status(200)          
        })
        .on('error', e => {
            res.status(500)
        })
    })
    return
})
like image 4
idbehold Avatar answered Oct 17 '22 15:10

idbehold