I'm trying to implement a PDF word count in JavaScript. I came across pdf.js, which uses promises. Is there a way to wait until the script is done before returning the count? I know that this goes against the idea of promises, but the other JS PDF readers out there either sometimes produce a bunch of gibberish or return nothing. In its current form the function always returns a word count of 0.
// Attempts to count the words in the PDF at pdfUrl using pdf.js.
// NOTE: as written this never hands a usable count back to the caller:
// countWords itself has no return statement, and the only `return count`
// sits inside a promise callback that runs before any page text has loaded.
function countWords(pdfUrl){
var pdf = PDFJS.getDocument(pdfUrl);
var count = 0;
// Runs later, once the document promise resolves; its return value becomes
// the result of the promise returned by then(), which is discarded here.
pdf.then(function(pdf) {
var maxPages = pdf.pdfInfo.numPages;
for (var j = 1; j <= maxPages; j++) {
var page = pdf.getPage(j);
// `var` is function-scoped: every page callback below shares this one txt.
var txt = "";
page.then(function(page) {
var textContent = page.getTextContent();
// The callback parameter named `page` here is the resolved text content
// (it shadows the outer page) and exposes an `items` array.
textContent.then(function(page){
for(var i=0;i<page.items.length;i++){
// txtadd is assigned without a declaration, so it is an implicit global.
txtadd = page.items[i].str
// Inside the character class, `!-(` is a range from '!' to '(' rather than
// three literal characters, so a literal '-' is removed by this replace.
txt += txtadd.replace(/[^a-zA-Z0-9:;,.?!-() ]/g,'');
}
count = count + txt.split(" ").length;
})
})
}
// Reached as soon as the loop has scheduled the page promises, i.e. before
// any of the callbacks above have added to count, so count is still 0 here.
return count;
});
}
Promises cannot be handled synchronously. countWords cannot return a value immediately; it has to wait for the inner promises (one for the document and several for the pages and their text content) to be resolved. So countWords must either return a Promise or accept a callback. The best approach is to return the promise and chain then() calls. When several results need to be joined, use Promise.all:
/**
 * Counts the words in a PDF document using pdf.js.
 *
 * @param {string} pdfUrl - Location of the PDF, passed to PDFJS.getDocument.
 * @returns {Promise<number>} Resolves with the word count summed over all
 *   pages; rejects if the document, a page, or its text content fails to load.
 */
function countWords(pdfUrl){
  /**
   * Counts the words on one page.
   * All state is local to this call, so pages resolving in any order cannot
   * leak text into each other's counts.
   */
  function countPageWords(page) {
    return page.getTextContent().then(function (textContent) {
      var pageText = textContent.items
        .map(function (item) {
          // '-' is placed last in the class so it is a literal hyphen rather
          // than forming a range with its neighbours.
          return item.str.replace(/[^a-zA-Z0-9:;,.?!() -]/g, '');
        })
        // Separate items with a space so the last word of one item is not
        // glued to the first word of the next.
        .join(' ');
      // Split on runs of whitespace and drop empty tokens, so an empty page
      // counts as 0 words and repeated spaces do not inflate the count.
      return pageText.split(/\s+/).filter(function (word) {
        return word.length > 0;
      }).length;
    });
  }

  var pdf = PDFJS.getDocument(pdfUrl);
  return pdf.then(function (pdf) { // calculate total count for document
    var maxPages = pdf.pdfInfo.numPages;
    var countPromises = []; // one promise per page, each resolving to a count
    for (var j = 1; j <= maxPages; j++) {
      countPromises.push(pdf.getPage(j).then(countPageWords));
    }
    // Wait for all pages and sum counts
    return Promise.all(countPromises).then(function (counts) {
      return counts.reduce(function (sum, c) { return sum + c; }, 0);
    });
  });
}
// waiting on countWords to finish completion, or error
// Start the count for the sample document: show the total when it resolves,
// or log the rejection reason if loading fails.
countWords("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf").then(
  function showTotal(total) {
    alert(total);
  },
  function logFailure(err) {
    console.error(err);
  }
);
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With