I have a simple pdf file, containing the words "Hello world", each in a different colour.
I'm loading the PDF, like this:
// Load the document and hand it to onPDF. The trailing then(null, ...) reports
// a failed load (or an error thrown by onPDF) instead of silently dropping it.
PDFJS.getDocument('test.pdf').then( onPDF ).then( null, function ( err )
{
    console.error( 'Failed to read text content from test.pdf:', err );
} );
/**
 * Requests page 1 of the loaded document and forwards it to onPage.
 *
 * @param {Object} pdf - Loaded document; must expose getPage(pageNumber)
 *     returning a thenable.
 * @returns {*} The thenable produced by chaining onPage, so the caller can
 *     attach a rejection handler.
 */
function onPDF( pdf )
{
    // Return the chain rather than dropping it, so failures can propagate.
    return pdf.getPage( 1 ).then( onPage );
}
/**
 * Requests the text content of a page and forwards it to onText.
 *
 * @param {Object} page - Page object; must expose getTextContent() returning
 *     a thenable.
 * @returns {*} The thenable produced by chaining onText, so the caller can
 *     attach a rejection handler.
 */
function onPage( page )
{
    // Return the chain rather than dropping it, so failures can propagate.
    return page.getTextContent().then( onText );
}
/**
 * Writes the extracted text content to the console as a JSON string.
 *
 * @param {Object} text - Value to serialise and log.
 */
function onText( text ) {
    var serialized = JSON.stringify( text );
    console.log( serialized );
}
And I get a JSON output like this:
{
    "items" : [{
            "str" : "Hello ",
            "dir" : "ltr",
            "width" : 29.592,
            "height" : 12,
            "transform" : [12, 0, 0, 12, 56.8, 774.1],
            "fontName" : "g_font_1"
        }, {
            "str" : "world",
            "dir" : "ltr",
            "width" : 27.983999999999998,
            "height" : 12,
            "transform" : [12, 0, 0, 12, 86.5, 774.1],
            "fontName" : "g_font_1"
        }
    ],
    "styles" : {
        "g_font_1" : {
            "fontFamily" : "serif",
            "ascent" : 0.891,
            "descent" : 0.216
        }
    }
}
However, I've not been able to find a way to determine the colour of each word. When I render it, it renders properly, so I know the information is in there somewhere. Is there somewhere I can access this?
As Respawned alluded to, there is no easy answer that will work in all cases. That being said, here are two approaches which seem to work fairly well. Both have upsides and downsides.
Internally, the getTextContent method uses what's called an EvaluatorPreprocessor to parse the PDF operators and maintain the graphics state. So what we can do is implement a custom EvaluatorPreprocessor, override the preprocessCommand method, and use it to add the current text color to the graphics state. Once this is in place, any time a new text chunk is created, we can add a color attribute and set it to the current color state.
The downsides to this approach are:
Requires modifying the PDFJS source code. It also depends heavily on the current implementation of PDFJS, and could break if this is changed.
It will fail in cases where the text is used as a path to be filled with an image. Some PDF creators (such as Photoshop) create colored text by first creating a clipping path from all the given text characters and then painting a solid image over the path. So the only way to deduce the fill color is by reading the pixel values from the image, which would require painting it to a canvas. Even hooking into paintChar won't be of much help here, since the fill color will only emerge at a later time.
The upside is that it's fairly robust and works irrespective of the page background. It also does not require rendering anything to a canvas, so it can be done entirely in the background thread.
Code
All the modifications are made in the core/evaluator.js file.
First you must define the custom evaluator, after the EvaluatorPreprocessor definition.
var CustomEvaluatorPreprocessor = (function() {
    /**
     * EvaluatorPreprocessor variant that also records the current fill
     * color space and fill color (as RGB) on the state manager's state.
     *
     * @param stream - Passed through to EvaluatorPreprocessor.
     * @param xref - Passed through to EvaluatorPreprocessor; also kept on the
     *     instance because ColorSpace.parse needs it.
     * @param stateManager - Passed through to EvaluatorPreprocessor; its
     *     `state` receives the color fields.
     * @param resources - Kept on the instance and handed to ColorSpace.parse.
     */
    function CustomEvaluatorPreprocessor(stream, xref, stateManager, resources) {
        EvaluatorPreprocessor.call(this, stream, xref, stateManager);
        this.resources = resources;
        this.xref = xref;
        // Set the initial color state: fill mode, gray color space, black.
        // NOTE(review): relies on the base constructor exposing
        // this.stateManager - confirm against EvaluatorPreprocessor.
        var state = this.stateManager.state;
        state.textRenderingMode = TextRenderingMode.FILL;
        state.fillColorSpace = ColorSpace.singletons.gray;
        state.fillColor = [0, 0, 0];
    }

    // Inherit from EvaluatorPreprocessor and restore `constructor`, which
    // Object.create alone would leave pointing at the base class.
    CustomEvaluatorPreprocessor.prototype = Object.create(EvaluatorPreprocessor.prototype, {
        constructor: {
            value: CustomEvaluatorPreprocessor,
            writable: true,
            configurable: true
        }
    });

    /**
     * Runs the base preprocessing, then updates the tracked fill color space
     * and fill color for the fill-color operators handled below. All other
     * operators leave the tracked color unchanged.
     *
     * @param fn - Operator code, compared against OPS.* values.
     * @param args - Operator arguments.
     */
    CustomEvaluatorPreprocessor.prototype.preprocessCommand = function(fn, args) {
        EvaluatorPreprocessor.prototype.preprocessCommand.call(this, fn, args);
        var state = this.stateManager.state;
        switch (fn) {
            case OPS.setFillColorSpace:
                state.fillColorSpace = ColorSpace.parse(args[0], this.xref, this.resources);
                break;
            case OPS.setFillColor:
                // Interpret the components in whichever space is current.
                var cs = state.fillColorSpace;
                state.fillColor = cs.getRgb(args, 0);
                break;
            case OPS.setFillGray:
                state.fillColorSpace = ColorSpace.singletons.gray;
                state.fillColor = ColorSpace.singletons.gray.getRgb(args, 0);
                break;
            case OPS.setFillCMYKColor:
                state.fillColorSpace = ColorSpace.singletons.cmyk;
                state.fillColor = ColorSpace.singletons.cmyk.getRgb(args, 0);
                break;
            case OPS.setFillRGBColor:
                state.fillColorSpace = ColorSpace.singletons.rgb;
                state.fillColor = ColorSpace.singletons.rgb.getRgb(args, 0);
                break;
        }
    };
    return CustomEvaluatorPreprocessor;
})();
Next, you need to modify the getTextContent method to use the new evaluator:
// Inside getTextContent: construct the custom preprocessor, additionally passing `resources`.
var preprocessor = new CustomEvaluatorPreprocessor(stream, xref, stateManager, resources);
And lastly, in the newTextChunk method, add a color attribute:
// Extra property for the object built in newTextChunk: the fill color currently held on the state.
color: stateManager.state.fillColor
Another approach would be to extract the text bounding boxes via getTextContent, render the page, and for each text, get the pixel values which reside within its bounds, and take that to be the fill color.
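The downsides to this approach are:
It requires rendering the page to a canvas before any colors can be read.
It relies on the text bounding boxes reported by getTextContent, and only uses their position, width and height, so for rotated or scaled text the sampled pixels may not belong to the text.
It takes the darkest pixel inside the bounding box as the text color, so it assumes the text is darker than the background.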
The upside is that it's simple and does not require messing with the PDFJS source code. Also, it will work in cases where the text is used as a clipping path and filled with an image, though this can become hazy when you have complex image fills, in which case the choice of text color becomes ambiguous.
Demo
http://jsfiddle.net/x2rajt5g/
Sample PDFs to test:
Code
/**
 * Estimates a color for each text item by sampling the rendered pixels
 * inside the item's bounding box, and stores it on the item as `color`
 * (an [r, g, b] array).
 *
 * The darkest pixel in the box is taken as the text color. If the box is
 * nearly uniform (summed per-channel standard deviation below 20), or no
 * pixel of the box lies on the canvas, the color falls back to black.
 *
 * The box is derived from transform[4] (left), transform[5] (bottom,
 * measured from the bottom of the canvas), width and height, and is used as
 * pixel coordinates directly.
 * NOTE(review): this presumably expects the page rendered at scale 1 with
 * no rotation - confirm against the caller.
 *
 * @param {{data: ArrayLike<number>, width: number, height: number}} canvasImgData
 *     RGBA pixel data, four entries per pixel, rows stored top to bottom.
 * @param {Array<{transform: number[], width: number, height: number}>} texts
 *     Text items; each one is mutated by adding `color`.
 */
function parseColors(canvasImgData, texts) {
    var data = canvasImgData.data,
        width = canvasImgData.width,
        height = canvasImgData.height,
        defaultColor = [0, 0, 0],
        minVariance = 20;

    texts.forEach(function (t) {
        var left = Math.floor(t.transform[4]),
            right = left + Math.round(t.width),
            bottom = Math.round(height - t.transform[5]),
            top = bottom - Math.round(t.height),
            // Clamp to the canvas: otherwise a box overhanging the right edge
            // wraps into the next pixel row, and one outside the canvas reads
            // undefined values.
            x0 = Math.max(left, 0),
            x1 = Math.min(right, width),
            y0 = Math.max(top, 0),
            y1 = Math.min(bottom, height),
            darkest = null,
            best = Infinity,
            sampled = 0,
            stat = new ImageStats();

        for (var y = y0; y < y1; y++) {
            var i = (y * width + x0) * 4;
            for (var x = x0; x < x1; x++, i += 4) {
                var r = data[i],
                    g = data[i + 1],
                    b = data[i + 2],
                    v = r + g + b;
                if (v < best) { // the darker the "better"
                    best = v;
                    darkest = [r, g, b];
                }
                stat.addPixel(r, g, b);
                sampled++;
            }
        }

        // No pixels sampled (empty or off-canvas box) or a near-uniform
        // region: use the default. Each item gets its own copy so callers
        // can mutate one color without affecting the others.
        if (sampled === 0 || stat.getStdDev() < minVariance) {
            t.color = defaultColor.slice();
        } else {
            t.color = darkest;
        }
    });
}
/**
 * Accumulates RGB samples and reports how much they vary.
 *
 * Fields:
 *   pixelCount - number of samples added so far.
 *   pixels     - every sample, stored as [r, g, b].
 *   rgb        - running per-channel sums; empty until the first sample.
 *   mean, stdDev - initialised to 0 and not updated by the methods below.
 */
function ImageStats() {
    this.pixelCount = 0;
    this.pixels = [];
    this.rgb = [];
    this.mean = 0;
    this.stdDev = 0;
}

// Methods are attached to the existing prototype (rather than replacing it)
// so that `constructor` keeps pointing at ImageStats.

/**
 * Records one sample.
 *
 * @param {number} r - Red component.
 * @param {number} g - Green component.
 * @param {number} b - Blue component.
 */
ImageStats.prototype.addPixel = function (r, g, b) {
    if (!this.rgb.length) {
        // First sample: seed the running sums.
        this.rgb[0] = r;
        this.rgb[1] = g;
        this.rgb[2] = b;
    } else {
        this.rgb[0] += r;
        this.rgb[1] += g;
        this.rgb[2] += b;
    }
    this.pixelCount++;
    this.pixels.push([r, g, b]);
};

/**
 * Computes the population standard deviation of each channel and returns
 * the sum of the three.
 *
 * @returns {number} Summed per-channel standard deviation; 0 when no
 *     samples have been added (instead of NaN from dividing by zero).
 */
ImageStats.prototype.getStdDev = function () {
    var count = this.pixelCount;
    if (count === 0) {
        return 0;
    }
    var mean = [
        this.rgb[0] / count,
        this.rgb[1] / count,
        this.rgb[2] / count
    ];
    var diff = [0, 0, 0];
    this.pixels.forEach(function (p) {
        diff[0] += Math.pow(mean[0] - p[0], 2);
        diff[1] += Math.pow(mean[1] - p[1], 2);
        diff[2] += Math.pow(mean[2] - p[2], 2);
    });
    return Math.sqrt(diff[0] / count) +
        Math.sqrt(diff[1] / count) +
        Math.sqrt(diff[2] / count);
};
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With