Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Troubles with pdf.js promises

Tags:

promise

pdf.js

I'm trying to implement a PDF word count in JavaScript. I came across pdf.js, which uses promises. Is there a way to wait until the script is done before returning the count? I know that this goes against the idea of promises, but the other JS PDF readers out there either sometimes produce a bunch of gibberish or return nothing. In its current form the function always returns a word count of 0.

// Attempts to count the words in the PDF at pdfUrl with pdf.js.
// NOTE(review): this is the version the question is about. All of the work
// happens inside promise callbacks, and countWords itself has no return
// statement, so the caller never receives the count computed below.
function countWords(pdfUrl){
var pdf = PDFJS.getDocument(pdfUrl);
var count = 0;
pdf.then(function(pdf) {
     var maxPages = pdf.pdfInfo.numPages;
     for (var j = 1; j <= maxPages; j++) {
        var page = pdf.getPage(j);

        // `var` is function-scoped: this one `txt` is shared by every loop
        // iteration and therefore by every page callback below.
        var txt = "";
        page.then(function(page) {
            var textContent = page.getTextContent();
            // The parameter named `page` here receives the text content
            // (it is read through `.items`), shadowing the outer `page`.
            textContent.then(function(page){

            for(var i=0;i<page.items.length;i++){
                // `txtadd` is never declared, so this assigns an implicit global.
                txtadd = page.items[i].str
                // Inside the character class, `!-(` is a range from '!' to '(',
                // not three literal characters.
                txt += txtadd.replace(/[^a-zA-Z0-9:;,.?!-() ]/g,'');
            }
                count = count + txt.split(" ").length;

            })
        })
     }
     // This returns from the then() callback only, and nothing has waited for
     // the page / text-content promises started in the loop above.
     return count;
});

}

like image 982
Joe Harrison Avatar asked Nov 08 '16 08:11

Joe Harrison


1 Answer

Promises cannot be handled synchronously. countWords cannot return a value immediately; it has to wait for the inner promises (one for the document and several for the pages and their text contents) to be resolved. So countWords must either return a Promise or accept a callback. The best approach is to return the promises and chain then() calls. When you need to wait for several promises at once, use Promise.all:

/**
 * Counts the words in a PDF using pdf.js.
 *
 * Loading and text extraction are asynchronous, so the total is delivered
 * through a promise instead of being returned directly.
 *
 * @param {string} pdfUrl - Location of the PDF; passed as-is to PDFJS.getDocument.
 * @returns {Promise<number>} Resolves with the word count summed over all pages.
 *     Rejects if the document, a page, or a page's text content fails to load.
 */
function countWords(pdfUrl){
    // Characters outside this set are dropped before counting. The hyphen is
    // placed last so it is a literal '-' rather than the start of a range.
    var unwanted = /[^a-zA-Z0-9:;,.?!() -]/g;

    // Counts the words in one page's text content.
    function countItems(textContent) {
        // Local to each call, so one page's text never leaks into another
        // page's count.
        var txt = "";
        for (var i = 0; i < textContent.items.length; i++) {
            txt += textContent.items[i].str.replace(unwanted, '');
        }
        // split(" ") yields empty strings for blank pages and repeated spaces;
        // those are not words.
        return txt.split(" ").filter(function (word) {
            return word !== "";
        }).length;
    }

    // Resolves with the word count of page `pageNumber` (1-based).
    function countPage(pdf, pageNumber) {
        return pdf.getPage(pageNumber).then(function (page) {
            return page.getTextContent();
        }).then(countItems);
    }

    return PDFJS.getDocument(pdfUrl).then(function (pdf) {
        var countPromises = []; // one promise per page
        for (var j = 1; j <= pdf.numPages; j++) {
            countPromises.push(countPage(pdf, j));
        }
        // Wait for every page before summing.
        return Promise.all(countPromises);
    }).then(function (counts) {
        return counts.reduce(function (total, c) {
            return total + c;
        }, 0);
    });
}
// Consume the promise: show the total once counting completes, or log why it failed.
countWords("https://cdn.mozilla.net/pdfjs/tracemonkey.pdf").then(
  function showTotal(total) {
    alert(total);
  },
  function logFailure(err) {
    console.error(err);
  }
);
<script src="https://npmcdn.com/pdfjs-dist/build/pdf.js"></script>
like image 108
async5 Avatar answered Sep 28 '22 03:09

async5