I have already used the 'pdf.js-extract' npm module to fetch data from a PDF.
// Load the PDFExtract class exported by the 'pdf.js-extract' package.
var PDFExtract = require('pdf.js-extract').PDFExtract;
var pdfExtract = new PDFExtract();
var filename = "/home/aman/Downloads/sample_invoice.pdf";
// extract() takes (filename, options, callback). The options object must be
// supplied (an empty one is enough) so the callback lands in the third
// position; otherwise the function is treated as the options argument.
pdfExtract.extract(filename, {}, function (err, data) {
    // On failure, log the error and stop.
    if (err) return console.log(err);
    // Dump the extracted data as one JSON string.
    console.log(JSON.stringify(data));
});
But I am not getting the desired result. I want to fetch the relevant information from an invoice PDF, such as the tax, the total amount paid and the seller address, and save the fetched data into a MongoDB collection.
Once you've opened the file, click on the "Edit" tab, and then click on the "edit" icon. Now you can right-click on the text and select "Copy" to extract the text you need.
You must write one function per invoice format (fn company1, fn company2, ...).
Here is an example with three different functions that retrieve data from the output of the pdf.js-extract module:
// Sample invoice: one page holding two text items, a label ("Invoice Number")
// followed by its value ("INV-3337"). Each item carries its position (x, y),
// its text (str), direction, size and font name.
let sampleInvoice = {
  pages: [
    {
      content: [
        {
          x: 348.41,
          y: 125.59899999999993,
          str: "Invoice Number",
          dir: "ltr",
          width: 61.61760000000001,
          height: 8.8,
          fontName: "g_d0_f2",
        },
        {
          x: 451.935,
          y: 125.59899999999993,
          str: "INV-3337",
          dir: "ltr",
          width: 37.171200000000006,
          height: 8.8,
          fontName: "g_d0_f2",
        },
      ],
    },
  ],
};
// Browser demo: run each of the three lookup strategies against the sample
// invoice and show its result in an alert dialog, one after the other.
{
    // Lookup by approximate position on the page.
    const foundByPosition = searchByPosition(sampleInvoice.pages, 450, 125);
    alert(foundByPosition);

    // Lookup by the label that precedes the value.
    const foundByLabel = searchByPrev(sampleInvoice.pages, 'Invoice Number');
    alert(foundByLabel);

    // Lookup by the textual format of the value.
    const foundByFormat = searchByFormat(sampleInvoice.pages, /INV-\d+$/);
    alert(foundByFormat);
}
/**
 * Finds the first text item located near a given position.
 *
 * Pages and their items are scanned in order; the first item whose x and y
 * both lie within `range` of the requested coordinates wins.
 *
 * @param {Array<{content: Array<{x: number, y: number, str: string}>}>} pages
 *        Pages to scan; a page without a `content` array is skipped.
 * @param {number} x - Expected horizontal position of the item.
 * @param {number} y - Expected vertical position of the item.
 * @param {number} [range=10] - Maximum allowed difference on each axis.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByPosition(pages, x, y, range = 10)
{
    // Loop in all pages
    for (const page of pages) {
        // Loop in all content
        for (const item of page.content || []) {
            // Compare real distances rather than Math.floor(v / range) buckets:
            // bucketing misses items that sit just across a bucket boundary
            // (e.g. 449.9 vs 450 with a range of 10).
            const isNearX = Math.abs(item.x - x) <= range;
            const isNearY = Math.abs(item.y - y) <= range;
            if (isNearX && isNearY) {
                return item.str;
            }
        }
    }
    // No results found
    return 'NotFound';
}
/**
 * Returns the text of the item that directly follows a given label.
 *
 * The label comparison is case-insensitive. A label that is the last item of
 * its page has no follower, so it is ignored and the scan continues.
 * (Reading the item at index - 1 instead of index + 1 would give the item
 * that precedes the label.)
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan in order.
 * @param {string} txt - Label text to look for.
 * @returns {string} The `str` of the item after the first matching label,
 *          or 'NotFound'.
 */
function searchByPrev(pages,txt)
{
    // Normalise the label once for the case-insensitive comparison
    const wanted = txt.toLowerCase();
    for (const page of pages) {
        const items = page.content;
        // Index of the first label match that also has a following item
        const labelIndex = items.findIndex(
            (item, k) => item.str.toLowerCase() === wanted && items[k + 1]
        );
        if (labelIndex !== -1) {
            return items[labelIndex + 1].str;
        }
    }
    // No results found
    return 'NotFound';
}
/**
 * Returns the first text item whose content matches a regular expression.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages
 *        Pages to scan in order; a page without a `content` array is skipped.
 * @param {RegExp} regex - Pattern the item text must match. Global (`g`) and
 *        sticky (`y`) patterns are supported: `lastIndex` is reset before
 *        every test so each item is matched from its start.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByFormat(pages,regex)
{
    // Loop in all pages
    for (const page of pages) {
        // Loop in all content
        for (const item of page.content || []) {
            // test() on a /g or /y regex resumes from lastIndex; without this
            // reset, state left by a previous item or call makes it skip
            // valid matches.
            regex.lastIndex = 0;
            if (regex.test(item.str)) {
                return item.str;
            }
        }
    }
    // No results found
    return 'NotFound';
}
TRY HERE : https://jsfiddle.net/dkhqzg6s/
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With