Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

pdf form filled with PyPDF2 does not show in print

I need to fill a PDF form in batch, so I tried to write Python code to do it for me from a CSV file. I used the second answer in this question and it fills the forms fine; however, when I open the filled forms the answers do not show unless the corresponding field is selected. The answers also do not show when the form is printed. I looked into the PyPDF2 documentation to see if I can flatten the generated forms, but this feature has not been implemented yet even though it was requested about a year ago. My preference is not to use pdftk, so that I can compile the script without needing an additional dependency. When using the original code from the mentioned question, some fields show in the print and some don't, which leaves me confused about how they work. Any help is appreciated.

Here's the code.

# -*- coding: utf-8 -*-

from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader


def _getFields(obj, tree=None, retval=None, fileobj=None):
    """
    Collect the interactive form fields of a PDF into a dictionary.

    On the initial call (``retval is None``) the AcroForm dictionary is
    looked up in the document catalog of *obj*. The *tree* and *retval*
    parameters are for recursive use.

    :param obj: Reader-like object providing ``trailer``, ``_checkKids``
        and ``_buildField``.
    :param tree: Field (sub)tree to inspect; replaced by the AcroForm
        dictionary on the initial call.
    :param retval: Dictionary the discovered fields are accumulated in.
    :param fileobj: A file object (usually a text file) to write a
        report to on all interactive form fields found; it is forwarded
        unchanged to ``_checkKids`` and ``_buildField``.
    :return: The *retval* dictionary, filled in by ``_checkKids`` and
        ``_buildField``.
    :rtype: dict, or ``None`` if the catalog has no ``/AcroForm`` entry.
    """
    field_attributes = {
        '/FT': 'Field Type',
        '/Parent': 'Parent',
        '/T': 'Field Name',
        '/TU': 'Alternate Field Name',
        '/TM': 'Mapping Name',
        '/Ff': 'Field Flags',
        '/V': 'Value',
        '/DV': 'Default Value',
    }

    if retval is None:
        # Initial (non-recursive) call: start from the AcroForm tree.
        retval = {}
        catalog = obj.trailer["/Root"]
        if "/AcroForm" not in catalog:
            return None
        tree = catalog["/AcroForm"]

    if tree is None:
        return retval

    obj._checkKids(tree, retval, fileobj)

    # A tree carrying any of the known field attributes is itself a field.
    if any(attribute in tree for attribute in field_attributes):
        obj._buildField(tree, retval, fileobj, field_attributes)

    if "/Fields" in tree:
        for reference in tree["/Fields"]:
            obj._buildField(
                reference.getObject(), retval, fileobj, field_attributes
            )

    return retval


def get_form_fields(infile):
    """
    Return the current values of all form fields in a PDF file.

    :param infile: Path of the PDF file to inspect.
    :return: A dictionary mapping each field name to its ``/V`` entry,
        or to ``''`` when the field has no ``/V`` entry. An empty
        dictionary is returned when no form fields are found.
    """
    # Open the file in a context manager so the handle is always closed;
    # the result is built while the stream is still open.
    with open(infile, 'rb') as stream:
        fields = _getFields(PdfFileReader(stream))
        # _getFields returns None when the catalog has no /AcroForm entry.
        if not fields:
            return {}
        return {name: field.get('/V', '') for name, field in fields.items()}


def update_form_values(infile, outfile, newvals=None):
    """
    Fill the form fields of *infile* and save the result as *outfile*.

    :param infile: Path of the PDF form to read.
    :param outfile: Path the filled PDF is written to.
    :param newvals: Mapping of field name to new value. When omitted or
        empty, every field is filled with a placeholder of the form
        ``'#<index> <name>=<current value>'``.
    """
    values = newvals
    if not values:
        # Build the placeholder values once, not once per page; this
        # avoids re-opening and re-parsing the input for every page.
        try:
            values = {
                name: f'#{index} {name}={value}'
                for index, (name, value)
                in enumerate(get_form_fields(infile).items())
            }
        except Exception as e:
            # Best effort, as before: report the problem and still copy
            # the pages to the output.
            print(repr(e))
            values = {}

    # NOTE(review): the input stream is kept open until the output has
    # been written, on the assumption that the writer still reads page
    # data from the reader at write time -- confirm against PyPDF2.
    with open(infile, 'rb') as source:
        pdf = PdfFileReader(source)
        writer = PdfFileWriter()

        for page_number in range(pdf.getNumPages()):
            page = pdf.getPage(page_number)
            try:
                writer.updatePageFormFieldValues(page, values)
            except Exception as e:
                # A page that cannot be updated is reported and copied
                # unchanged.
                print(repr(e))
            writer.addPage(page)

        with open(outfile, 'wb') as out:
            writer.write(out)


if __name__ == '__main__':
    # Batch driver. On the first run (no formData.csv yet) a CSV template
    # listing the form's field names and current values is created. On
    # later runs one filled PDF is written to ./output per data row of
    # that CSV, named after the row's first column.
    import csv
    import os
    from glob import glob

    cwd = os.getcwd()
    outdir = os.path.join(cwd, 'output')
    csv_file_name = os.path.join(cwd, 'formData.csv')

    # glob() returns an empty list when nothing matches, so test the list
    # itself; indexing it first would raise IndexError before any check.
    # Sorting makes the choice of template deterministic.
    pdf_files = sorted(glob(os.path.join(cwd, '*.pdf')))
    if not pdf_files:
        raise SystemExit('No pdf file found')
    pdf_file_name = pdf_files[0]

    os.makedirs(outdir, exist_ok=True)

    if not os.path.isfile(csv_file_name):
        fields = get_form_fields(pdf_file_name)
        with open(csv_file_name, 'w', newline='') as csv_file:
            csvwriter = csv.writer(csv_file, delimiter=',')
            csvwriter.writerow(['user label'])
            csvwriter.writerow(['fields'] + list(fields.keys()))
            csvwriter.writerow(['Mr. X'] + list(fields.values()))
    else:
        with open(csv_file_name, 'r', newline='') as csv_file:
            csvdata = list(csv.reader(csv_file, delimiter=','))
        # Row 0 is a header and row 1 holds the field names; without
        # row 1 there is nothing to map the values onto.
        if len(csvdata) < 2:
            raise SystemExit('formData.csv does not contain a field-name row')
        fields = csvdata[1][1:]
        for frmi in csvdata[2:]:
            if not frmi:
                # csv.reader yields [] for blank lines; skip them.
                continue
            frmdict = dict(zip(fields, frmi[1:]))
            outfile = os.path.join(outdir, frmi[0] + '.pdf')
            update_form_values(pdf_file_name, outfile, frmdict)
like image 327
anishtain4 Avatar asked Jan 03 '23 04:01

anishtain4


2 Answers

I had the same issue, and apparently adding the "/NeedAppearances" attribute to the AcroForm of the PdfFileWriter object fixed the problem (see https://github.com/mstamy2/PyPDF2/issues/355). With much help from ademidun (https://github.com/ademidun), I was able to populate a PDF form and have the values of the fields show properly. The following is an example:

from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import BooleanObject, NameObject, IndirectObject


def set_need_appearances_writer(writer):
    """
    Set ``/NeedAppearances`` to true in the writer's AcroForm dictionary.

    If the writer's catalog has no ``/AcroForm`` entry yet, an empty
    AcroForm dictionary is created first. Any error is printed rather
    than raised.

    :param writer: The PDF writer to modify in place.
    :return: The same *writer* object, whether or not the update
        succeeded.
    """
    # See 12.7.2 and 7.7.2 for more information:
    # http://www.adobe.com/content/dam/acom/en/devnet/acrobat/
    #     pdfs/PDF32000_2008.pdf
    from PyPDF2.generic import DictionaryObject

    try:
        catalog = writer._root_object
        # Get the AcroForm tree and add the "/NeedAppearances" attribute.
        if "/AcroForm" not in catalog:
            # Register a real, empty dictionary with the writer. Pointing
            # IndirectObject(len(writer._objects), 0, writer) at it
            # instead would reference whichever object is currently last
            # in the writer's object list, not a new AcroForm dictionary.
            # The registration method is `_addObject` in PyPDF2 1.x.
            # NOTE(review): the `_add_object` fallback is for later
            # releases -- confirm it exists in the version in use.
            add_object = getattr(writer, "_add_object", None)
            if add_object is None:
                add_object = writer._addObject
            catalog[NameObject("/AcroForm")] = add_object(DictionaryObject())

        need_appearances = NameObject("/NeedAppearances")
        catalog["/AcroForm"][need_appearances] = BooleanObject(True)
        return writer

    except Exception as e:
        print("set_need_appearances_writer() catch : ", repr(e))
        return writer


# Values to write, keyed by form field name.
field_dictionary = {"Field1": "Value1", "Field2": "Value2"}

# Open the input form and, if it has an AcroForm, set /NeedAppearances
# on it.
reader = PdfFileReader("myInputPdf.pdf", strict=False)
if "/AcroForm" in reader.trailer["/Root"]:
    reader.trailer["/Root"]["/AcroForm"][
        NameObject("/NeedAppearances")
    ] = BooleanObject(True)

# Prepare the writer and set /NeedAppearances on its AcroForm as well.
writer = PdfFileWriter()
set_need_appearances_writer(writer)
if "/AcroForm" in writer._root_object:
    writer._root_object["/AcroForm"][
        NameObject("/NeedAppearances")
    ] = BooleanObject(True)

# Copy the first page over and fill in its fields.
writer.addPage(reader.getPage(0))
writer.updatePageFormFieldValues(writer.getPage(0), field_dictionary)

# Save the filled form.
with open("myOutputPdf.pdf", "wb") as fp:
    writer.write(fp)
like image 119
tromar Avatar answered Jan 05 '23 17:01

tromar


The underlying reason form fields do not show up after being filled in is that the values are not being added to the stream. Adding "NeedAppearances" tells the PDF reader that it needs to update the appearance (in this case, to create a stream for each field value), but not all PDF readers will honor that, and the fields may still look blank or show their default values.

The best solution to make sure the fields are updated for any reader is to create a stream for each field and add it to the field's XObject.

Here is an example solution for single line text fields. It also encodes the stream, updates the default value, and sets the fields to read only, which are all optional.

# Fill single-line text fields by writing an appearance stream for each
# field, then mark the fields read-only and save the document.
#
# NOTE(review): this snippet shows no imports. PdfReader, PdfWriter,
# NameObject, TextStringObject, NumberObject, FlateDecode,
# encode_pdfdocencoding and the *Attributes / FieldFlag / FilterTypes
# constants presumably come from PyPDF2 and its sub-modules -- confirm
# against the installed version.

# Example data.
data = {
    "field_name": "some value"
}

# Get template.
template = PdfReader("template-form.pdf", strict=False)

# Initialize writer.
writer = PdfWriter()

# Add the template page.
writer.add_page(template.pages[0])

# Get page annotations.
page_annotations = writer.pages[0][PageAttributes.ANNOTS]

# Loop through page annotations (fields).
for annotation_ref in page_annotations:  # type: ignore
    # Get annotation object.
    annotation = annotation_ref.get_object()  # type: ignore

    # Get existing values needed to create the new stream and update the field.
    field = annotation.get(NameObject("/T"))
    ap = annotation.get(AnnotationDictionaryAttributes.AP)
    font = annotation.get(InteractiveFormDictEntries.DA)
    rect = annotation.get(AnnotationDictionaryAttributes.Rect)

    # An annotation without a name, appearance dictionary, default
    # appearance string or rectangle cannot be filled this way; skip it
    # instead of failing on the first such annotation.
    if field is None or ap is None or font is None or rect is None:
        continue

    new_value = data.get(field, 'N/A')
    x_object = ap.get(NameObject("/N")).get_object()

    # Calculate the text position.
    # Assumes the default appearance string looks like "/Font <size> Tf ...".
    font_size = float(font.split(" ")[1])
    w = round(float(rect[2] - rect[0] - 2), 2)
    h = round(float(rect[3] - rect[1] - 2), 2)
    text_position_h = h / 2 - font_size / 3  # approximation

    # Backslashes and parentheses delimit/escape PDF literal strings, so
    # they must be escaped before the value is placed inside "( ... ) Tj".
    escaped_value = (
        str(new_value)
        .replace("\\", "\\\\")
        .replace("(", "\\(")
        .replace(")", "\\)")
    )

    # Create a new XObject stream.
    new_stream = f'''
        /Tx BMC
        q
        1 1 {w} {h} re W n
        BT
        {font}
        2 {text_position_h} Td
        ({escaped_value}) Tj
        ET
        Q
        EMC
    '''

    # Add Filter type to XObject.
    x_object.update(
        {
            NameObject(StreamAttributes.FILTER): NameObject(FilterTypes.FLATE_DECODE)
        }
    )

    # Update and encode XObject stream.
    x_object._data = FlateDecode.encode(encode_pdfdocencoding(new_stream))

    # Update annotation dictionary.
    annotation.update(
        {
            # Update Value.
            NameObject(FieldDictionaryAttributes.V): TextStringObject(
                new_value
            ),
            # Update Default Value.
            NameObject(FieldDictionaryAttributes.DV): TextStringObject(
                new_value
            ),
            # Set Read Only flag.
            NameObject(FieldDictionaryAttributes.Ff): NumberObject(
                FieldFlag(1)
            )
        }
    )

# Clone document root & metadata from template.
# This is required so that the document doesn't try to save before closing.
writer.clone_reader_document_root(template)

# write "output".
with open("output.pdf", "wb") as output_stream:
    writer.write(output_stream)  # type: ignore

Thanks to fidoriel and others from the discussion here: https://github.com/py-pdf/PyPDF2/issues/355.

like image 25
JeremyM4n Avatar answered Jan 05 '23 18:01

JeremyM4n