Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

pdf.js: Get the text colour

I have a simple pdf file, containing the words "Hello world", each in a different colour.

I'm loading the PDF, like this:

PDFJS.getDocument('test.pdf').then( onPDF );

function onPDF( pdf )
{
    pdf.getPage( 1 ).then( onPage );
}

function onPage( page )
{
    page.getTextContent().then( onText );
}

function onText( text )
{   
    console.log( JSON.stringify( text ) );
}

And I get a JSON output like this:

{
    "items" : [{
            "str" : "Hello ",
            "dir" : "ltr",
            "width" : 29.592,
            "height" : 12,
            "transform" : [12, 0, 0, 12, 56.8, 774.1],
            "fontName" : "g_font_1"
        }, {
            "str" : "world",
            "dir" : "ltr",
            "width" : 27.983999999999998,
            "height" : 12,
            "transform" : [12, 0, 0, 12, 86.5, 774.1],
            "fontName" : "g_font_1"
        }
    ],
    "styles" : {
        "g_font_1" : {
            "fontFamily" : "serif",
            "ascent" : 0.891,
            "descent" : 0.216
        }
    }
}

However, I've not been able to find a way to determine the colour of each word. When I render it, it renders properly, so I know the information is in there somewhere. Is there somewhere I can access this?

like image 395
divillysausages Avatar asked Dec 30 '14 22:12

divillysausages


1 Answers

As Respawned alluded to, there is no easy answer that will work in all cases. That being said, here are two approaches which seem to work fairly well. Both having upsides and downsides.

Approach 1

Internally, the getTextContent method uses whats called an EvaluatorPreprocessor to parse the PDF operators, and maintain the graphic state. So what we can do is, implement a custom EvaluatorPreprocessor, overwrite the preprocessCommand method, and use it to add the current text color to the graphic state. Once this is in place, anytime a new text chunk is created, we can add a color attribute, and set it to the current color state.

The downsides to this approach are:

  1. Requires modifying the PDFJS source code. It also depends heavily on the current implementation of PDFJS, and could break if this is changed.

  2. It will fail in cases where the text is used as a path to be filled with an image. In some PDF creators (such as Photoshop), the way it creates colored text is, it first creates a clipping path from all the given text characters, and then paints a solid image over the path. So the only way to deduce the fill-color is by reading the pixel values from the image, which would require painting it to a canvas. Even hooking into paintChar wont be of much help here, since the fill color will only emerge at a later time.

The upside is, its fairly robust and works irrespective of the page background. It also does not require rendering anything to canvas, so it can be done entirely in the background thread.

Code

All the modifications are made in the core/evaluator.js file.

First you must define the custom evaluator, after the EvaluatorPreprocessor definition.

var CustomEvaluatorPreprocessor = (function() {
    function CustomEvaluatorPreprocessor(stream, xref, stateManager, resources) {
        EvaluatorPreprocessor.call(this, stream, xref, stateManager);
        this.resources = resources;
        this.xref = xref;

        // set initial color state
        var state = this.stateManager.state;
        state.textRenderingMode = TextRenderingMode.FILL;
        state.fillColorSpace = ColorSpace.singletons.gray;
        state.fillColor = [0,0,0];
    }

    CustomEvaluatorPreprocessor.prototype = Object.create(EvaluatorPreprocessor.prototype);

    CustomEvaluatorPreprocessor.prototype.preprocessCommand = function(fn, args) {
        EvaluatorPreprocessor.prototype.preprocessCommand.call(this, fn, args);
        var state = this.stateManager.state;
        switch(fn) {
            case OPS.setFillColorSpace:
                state.fillColorSpace = ColorSpace.parse(args[0], this.xref, this.resources);
            break;
            case OPS.setFillColor:
                 var cs = state.fillColorSpace;
                 state.fillColor = cs.getRgb(args, 0);
            break;
            case OPS.setFillGray:
              state.fillColorSpace = ColorSpace.singletons.gray;
              state.fillColor = ColorSpace.singletons.gray.getRgb(args, 0);
            break;
            case OPS.setFillCMYKColor:
              state.fillColorSpace = ColorSpace.singletons.cmyk;
              state.fillColor = ColorSpace.singletons.cmyk.getRgb(args, 0);
            break;
            case OPS.setFillRGBColor:
                state.fillColorSpace = ColorSpace.singletons.rgb;
                state.fillColor = ColorSpace.singletons.rgb.getRgb(args, 0);
            break;
        }
    };

    return CustomEvaluatorPreprocessor;
})();

Next, you need to modify the getTextContent method to use the new evaluator:

var preprocessor = new CustomEvaluatorPreprocessor(stream, xref, stateManager, resources);

And lastly, in the newTextChunk method, add a color attribute:

color: stateManager.state.fillColor

Approach 2

Another approach would be to extract the text bounding boxes via getTextContent, render the page, and for each text, get the pixel values which reside within its bounds, and take that to be the fill color.

The downsides to this approach are:

  1. The computed text bounding boxes are not always correct, and in some cases may even be off completely (eg: rotated text). If the bounding box does not cover at least partially the actual text on canvas, then this method will fail. We can recover from complete failures, by checking that the text pixels have a color variance greater than a threshold. The rationale being, if bounding box is completely background, it will have little variance, in which case we can fallback to a default text color (or maybe even the color of k nearest-neighbors).
  2. The method assumes the text is darker than the background. Otherwise, the background could be mistaken as the fill color. This wont be a problem is most cases, as most docs have white backgrounds.

The upside is, its simple, and does not require messing with the PDFJS source-code. Also, it will work in cases where the text is used as a clipping path, and filled with an image. Though this can become hazy when you have complex image fills, in which case, the choice of text color becomes ambiguous.

Demo

http://jsfiddle.net/x2rajt5g/

Sample PDF's to test:

  • https://www.dropbox.com/s/0t5vtu6qqsdm1d4/color-test.pdf?dl=1
  • https://www.dropbox.com/s/cq0067u80o79o7x/testTextColour.pdf?dl=1

Code

function parseColors(canvasImgData, texts) {
    var data = canvasImgData.data,
        width = canvasImgData.width,
        height = canvasImgData.height,
        defaultColor = [0, 0, 0],
        minVariance = 20;

    texts.forEach(function (t) {
        var left = Math.floor(t.transform[4]),
            w = Math.round(t.width),
            h = Math.round(t.height),
            bottom = Math.round(height - t.transform[5]),
            top = bottom - h,
            start = (left + (top * width)) * 4,
            color = [],
            best = Infinity,
            stat = new ImageStats();

        for (var i, v, row = 0; row < h; row++) {
            i = start + (row * width * 4);
            for (var col = 0; col < w; col++) {
                if ((v = data[i] + data[i + 1] + data[i + 2]) < best) { // the darker the "better"
                    best = v;
                    color[0] = data[i];
                    color[1] = data[i + 1];
                    color[2] = data[i + 2];
                }
                stat.addPixel(data[i], data[i+1], data[i+2]);
                i += 4;
            }
        }
        var stdDev = stat.getStdDev();
        t.color = stdDev < minVariance ? defaultColor : color;
    });
}

function ImageStats() {
    this.pixelCount = 0;
    this.pixels = [];
    this.rgb = [];
    this.mean = 0;
    this.stdDev = 0;
}

ImageStats.prototype = {
    addPixel: function (r, g, b) {
        if (!this.rgb.length) {
            this.rgb[0] = r;
            this.rgb[1] = g;
            this.rgb[2] = b;
        } else {
            this.rgb[0] += r;
            this.rgb[1] += g;
            this.rgb[2] += b;
        }
        this.pixelCount++;
        this.pixels.push([r,g,b]);
    },

    getStdDev: function() {
        var mean = [
            this.rgb[0] / this.pixelCount,
            this.rgb[1] / this.pixelCount,
            this.rgb[2] / this.pixelCount
        ];
        var diff = [0,0,0];
        this.pixels.forEach(function(p) {
            diff[0] += Math.pow(mean[0] - p[0], 2);
            diff[1] += Math.pow(mean[1] - p[1], 2);
            diff[2] += Math.pow(mean[2] - p[2], 2);
        });
        diff[0] = Math.sqrt(diff[0] / this.pixelCount);
        diff[1] = Math.sqrt(diff[1] / this.pixelCount);
        diff[2] = Math.sqrt(diff[2] / this.pixelCount);
        return diff[0] + diff[1] + diff[2];
    }
};
like image 90
levi Avatar answered Sep 19 '22 11:09

levi