I have already used the 'pdf.js-extract' npm module to fetch data from a PDF.
// Extract the raw text items of the invoice PDF with pdf.js-extract and
// print them as JSON.
const PDFExtract = require('pdf.js-extract').PDFExtract;
const pdfExtract = new PDFExtract();
const filename = "/home/aman/Downloads/sample_invoice.pdf";
// extract() is documented as (filename, options, callback). Pass an explicit
// (empty) options object so the callback is received in the callback slot
// instead of being taken for the options argument.
pdfExtract.extract(filename, {}, function (err, data) {
  if (err) return console.log(err);
  console.log(JSON.stringify(data));
});
But I am not getting the desired result. I want to extract the relevant information from an invoice PDF, such as the tax, the total amount paid, and the seller's address, and save the extracted data into a MongoDB collection.
Once you've opened the file, click on the "Edit" tab, and then click on the "edit" icon. Now you can right-click on the text and select "Copy" to extract the text you need.
You must write one extraction function per invoice format (fn company1, fn company2, ...).
Here is an example with three different functions that retrieve data from the output of the pdf.js-extract module:
// Sample invoice: one page whose content holds two text items, a label
// ("Invoice Number") and, at the same y position, its value ("INV-3337").
let sampleInvoice = {
  pages: [
    {
      content: [
        { x: 348.41, y: 125.59899999999993, str: "Invoice Number", dir: "ltr", width: 61.61760000000001, height: 8.8, fontName: "g_d0_f2" },
        { x: 451.935, y: 125.59899999999993, str: "INV-3337", dir: "ltr", width: 37.171200000000006, height: 8.8, fontName: "g_d0_f2" },
      ],
    },
  ],
};
// Browser demo: run each lookup strategy against the sample invoice, in
// order, and show every result in its own alert box.
for (const runLookup of [
  () => searchByPosition(sampleInvoice.pages, 450, 125),
  () => searchByPrev(sampleInvoice.pages, 'Invoice Number'),
  () => searchByFormat(sampleInvoice.pages, /INV-\d+$/),
]) {
  alert(runLookup());
}
/**
 * Find the first text item located near the given coordinates.
 *
 * @param {Array<{content: Array<{x: number, y: number, str: string}>}>} pages
 *   Pages to scan; each page carries a `content` array of positioned items.
 * @param {number} x - Target horizontal position.
 * @param {number} y - Target vertical position.
 * @param {number} [range=10] - Maximum allowed difference on each axis.
 * @returns {string} The `str` of the first item within range, or 'NotFound'.
 */
function searchByPosition(pages, x, y, range = 10) {
  for (const page of pages) {
    for (const item of page.content) {
      // Compare real distances instead of Math.floor(coord / range) buckets:
      // bucketing misses items that are only a few units away but sit on the
      // other side of a bucket boundary (e.g. 449 vs 451.9 with range 10).
      const isNearX = Math.abs(item.x - x) <= range;
      const isNearY = Math.abs(item.y - y) <= range;
      if (isNearX && isNearY) {
        return item.str;
      }
    }
  }
  // No results found
  return 'NotFound';
}
/**
 * Look up a value by the label that precedes it: find the item whose text
 * equals `txt` (case-insensitive) and return the text of the item that comes
 * right after it on the same page.
 *
 * A label that is the last item of its page has no follower and is skipped.
 * (Reading the item at position - 1 instead would give the mirrored lookup.)
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan.
 * @param {string} txt - Label text to search for.
 * @returns {string} Text of the item following the label, or 'NotFound'.
 */
function searchByPrev(pages, txt) {
  const wanted = txt.toLowerCase();
  for (const { content } of pages) {
    for (const [pos, item] of content.entries()) {
      if (item.str.toLowerCase() !== wanted) {
        continue;
      }
      const follower = content[pos + 1];
      if (follower) {
        return follower.str;
      }
    }
  }
  // No results found
  return 'NotFound';
}
/**
 * Find the first text item whose text matches the given regular expression.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan.
 * @param {RegExp} regex - Pattern the item text must match.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByFormat(pages, regex) {
  for (const page of pages) {
    for (const item of page.content) {
      // test() on a global/sticky regex resumes from regex.lastIndex, which a
      // previous successful call leaves advanced. Start every test from 0 so
      // a reused regex cannot skip a match.
      if (regex.global || regex.sticky) {
        regex.lastIndex = 0;
      }
      if (regex.test(item.str)) {
        return item.str;
      }
    }
  }
  // No results found
  return 'NotFound';
}
TRY HERE : https://jsfiddle.net/dkhqzg6s/
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With