Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Is there a matplotlib flowable for ReportLab?

I want to embed matplotlib charts into PDFs generated by ReportLab directly - i.e. not saving as a PNG first and then embedding the PNG into the PDF (i think I'll get better quality output).

Does anyone know if there's a matplotlib flowable for ReportLab?

Thanks

like image 383
jeff Avatar asked Jan 14 '11 11:01

jeff


People also ask

What is Flowable reportlab?

Reportlab is a very flexible PDF creation package for Python. You can layout your documents using absolute positioning or by using Flowable objects, such as a Paragraph, a Table or Frame. You can even mix the two together!


1 Answers

Patrick Maupin, the author of pdfrw, provided a simpler, less complicated answer in another question. (My thanks for his kind words about my previous answer.) He also mentioned that saving matplotlib figures to a multi-page PDF before using pdfrw to extract them would reduce the size of the final reportlab PDF by reducing duplicate resources. So here's a modification of his code example that demonstrates how the PDF file size is reduced by writing to a multi-page matplotlib PDF first. For this example, the file size is reduced by about 80%.

Note: This is specialized for use with matplotlib figures.

import os
from matplotlib import pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer, Flowable
from reportlab.lib.units import inch
from reportlab.lib.styles import getSampleStyleSheet

from pdfrw import PdfReader, PdfDict
from pdfrw.buildxobj import pagexobj
from pdfrw.toreportlab import makerl

try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

styles = getSampleStyleSheet()
style = styles['Normal']


class PdfImage(Flowable):
    """
    Generates a reportlab image flowable for matplotlib figures. It is initialized
    with either a matplotlib figure or a pointer to a list of pagexobj objects and
    an index for the pagexobj to be used.
    """
    def __init__(self, fig=None, width=200, height=200, cache=None, cacheindex=0):
        self.img_width = width
        self.img_height = height
        if fig is None and cache is None:
            raise ValueError("Either 'fig' or 'cache' must be provided")
        if fig is not None:
            imgdata = BytesIO()
            fig.savefig(imgdata, format='pdf')
            imgdata.seek(0)
            page, = PdfReader(imgdata).pages
            image = pagexobj(page)
            self.img_data = image
        else:
            self.img_data = None
        self.cache = cache
        self.cacheindex = cacheindex

    def wrap(self, width, height):
        return self.img_width, self.img_height

    def drawOn(self, canv, x, y, _sW=0):
        if _sW > 0 and hasattr(self, 'hAlign'):
            a = self.hAlign
            if a in ('CENTER', 'CENTRE', TA_CENTER):
                x += 0.5*_sW
            elif a in ('RIGHT', TA_RIGHT):
                x += _sW
            elif a not in ('LEFT', TA_LEFT):
                raise ValueError("Bad hAlign value " + str(a))
        canv.saveState()
        if self.img_data is not None:
            img = self.img_data
        else:
            img = self.cache[self.cacheindex]
        if isinstance(img, PdfDict):
            xscale = self.img_width / img.BBox[2]
            yscale = self.img_height / img.BBox[3]
            canv.translate(x, y)
            canv.scale(xscale, yscale)
            canv.doForm(makerl(canv, img))
        else:
            canv.drawImage(img, x, y, self.img_width, self.img_height)
        canv.restoreState()


class PdfImageCache(object):
    """
    Saves matplotlib figures to a temporary multi-page PDF file using the 'savefig'
    method. When closed the images are extracted and saved to the attribute 'cache'.
    The temporary PDF file is then deleted. The 'savefig' returns a PdfImage object
    with a pointer to the 'cache' list and an index for the figure. Use of this
    cache reduces duplicated resources in the reportlab generated PDF file.

    Use is similar to matplotlib's PdfPages object. When not used as a context
    manager, the 'close()' method must be explictly called before the reportlab
    document is built.
    """
    def __init__(self):
        self.pdftempfile = '_temporary_pdf_image_cache_.pdf'
        self.pdf = PdfPages(self.pdftempfile)
        self.cache = []
        self.count = 0

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self, *args):
        self.pdf.close()
        pages = PdfReader(self.pdftempfile).pages
        pages = [pagexobj(x) for x in pages]
        self.cache.extend(pages)
        os.remove(self.pdftempfile)

    def savefig(self, fig, width=200, height=200):
        self.pdf.savefig(fig)
        index = self.count
        self.count += 1
        return PdfImage(width=width, height=height, cache=self.cache, cacheindex=index)


def make_report(outfn, nfig=5):
    """
    Makes a dummy report with nfig matplotlib plots.
    """

    doc = SimpleDocTemplate(outfn)
    style = styles["Normal"]
    story = [Spacer(0, inch)]

    for j in range(nfig):

        fig = plt.figure(figsize=(4, 3))
        plt.plot([1, 2, 3, 4], [1, 4, 9, 26])
        plt.ylabel('some numbers')
        plt.title('My Figure %i' % (j+1))
        img = PdfImage(fig, width=400, height=400)
        plt.close()

        for i in range(10):
            bogustext = ("Paragraph number %s. " % i)
            p = Paragraph(bogustext, style)
            story.append(p)
            story.append(Spacer(1, 0.2*inch))

        story.append(img)

        for i in range(10):
            bogustext = ("Paragraph number %s. " % i)
            p = Paragraph(bogustext, style)
            story.append(p)
            story.append(Spacer(1, 0.2*inch))

    doc.build(story)


def make_report_cached_figs(outfn, nfig=5):
    """
    Makes a dummy report with nfig matplotlib plots using PdfImageCache
    to reduce PDF file size.
    """

    doc = SimpleDocTemplate(outfn)
    style = styles["Normal"]
    story = [Spacer(0, inch)]

    with PdfImageCache() as pdfcache:
        for j in range(nfig):

            fig = plt.figure(figsize=(4, 3))
            plt.plot([1, 2, 3, 4], [1, 4, 9, 26])
            plt.ylabel('some numbers')
            plt.title('My Figure %i' % (j+1))
            img = pdfcache.savefig(fig, width=400, height=400)
            plt.close()

            for i in range(10):
                bogustext = ("Paragraph number %s. " % i)
                p = Paragraph(bogustext, style)
                story.append(p)
                story.append(Spacer(1, 0.2*inch))

            story.append(img)

            for i in range(10):
                bogustext = ("Paragraph number %s. " % i)
                p = Paragraph(bogustext, style)
                story.append(p)
                story.append(Spacer(1, 0.2*inch))

    doc.build(story)


make_report("hello_pdf.pdf", 50)
make_report_cached_figs("hello_pdf_cached_figs.pdf", 50)

Since matplotlib's PdfPages only takes a file path as input, the PdfImageCache object writes the multi-page PDF to a temporary file. Trying to do it in memory would take a lot more work.

like image 129
Larry Meyn Avatar answered Oct 25 '22 18:10

Larry Meyn