Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

split a pdf based on outline

Tags:

python

pdf

pypdf

I would like to use pyPdf to split a PDF file based on its outline, where each destination in the outline refers to a different page within the PDF.

example outline:

main       --> points to page 1
  sect1    --> points to page 1
  sect2    --> points to page 15
  sect3    --> points to page 22

It is easy within pyPdf to iterate over each page of the document or over each destination in the document's outline; however, I cannot figure out how to get the page number that a destination points to.

Does anybody know how to find the referenced page number for each destination in the outline?

like image 368
darrell Avatar asked Dec 16 '09 23:12

darrell


1 Answer

I figured it out:

class Darrell(pyPdf.PdfFileReader):
    """A ``PdfFileReader`` that can report the page each outline entry targets."""

    def getDestinationPageNumbers(self):
        """Map every outline title to the page its destination points at.

        Returns:
            dict: ``{title: page}`` where ``page`` is the zero-based index of
            the target page in document order, or the string ``'???'`` when
            the destination's page could not be matched to the page tree.
            Entries sharing a title collapse into a single key.
        """
        def _setup_outline_page_ids(outline, _result=None):
            # Flatten the (possibly nested) outline into
            # {(id(destination), title): object number of the target page}.
            # id() is part of the key so equal titles stay distinct here.
            if _result is None:
                _result = {}
            for obj in outline:
                if isinstance(obj, pyPdf.pdf.Destination):
                    # The page entry is presumably an indirect reference
                    # carrying ``idnum``; anything else (TODO confirm: e.g. a
                    # null or numeric page) is recorded as None so it falls
                    # through to the '???' marker below instead of raising.
                    _result[(id(obj), obj.title)] = getattr(
                        obj.page, 'idnum', None)
                elif isinstance(obj, list):
                    # Nested lists hold the children of the preceding entry.
                    _setup_outline_page_ids(obj, _result)
            return _result

        def _setup_page_id_to_num():
            # Walk the page tree depth-first and record, for every kid of a
            # /Pages node, how many /Page leaves were seen before it.  For a
            # leaf that is its zero-based page index; for an intermediate
            # /Pages node it is the index of the first page beneath it.
            page_numbers = {}
            leaves_seen = [0]  # one-element list: mutable from the closure

            def _walk(node):
                node_type = node["/Type"]
                if node_type == "/Pages":
                    for kid in node["/Kids"]:
                        page_numbers[kid.idnum] = leaves_seen[0]
                        _walk(kid.getObject())
                elif node_type == "/Page":
                    leaves_seen[0] += 1

            _walk(self.trailer["/Root"].getObject()["/Pages"].getObject())
            return page_numbers

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()

        result = {}
        # .items() rather than .iteritems() so this runs on Python 2 and 3.
        for (_, title), page_idnum in outline_page_ids.items():
            result[title] = page_id_to_page_numbers.get(page_idnum, '???')
        return result

# Print a "page / title" table for every outline entry of a PDF.
# NOTE: PATH-TO-PDF is the placeholder from the original answer; replace it
# with the real path (as written it parses as a subtraction of undefined names).
with open(PATH-TO-PDF, 'rb') as pdf_file:
    pdf = Darrell(pdf_file)
    # Resolve the outline while the underlying stream is still open.
    destinations = pdf.getDestinationPageNumbers()

template = '%-5s  %s'
print(template % ('page', 'title'))

# Sort by page, then title.  Entries whose page could not be resolved come
# back as the string '???'; they are kept last and never compared to ints.
rows = sorted(
    ((page, title) for title, page in destinations.items()),
    key=lambda row: (not isinstance(row[0], int), row),
)
for p, t in rows:
    # Page indices are zero-based; show them one-based.  Leave the '???'
    # marker untouched instead of attempting '???' + 1.
    print(template % (p + 1 if isinstance(p, int) else p, t))
like image 58
darrell Avatar answered Oct 19 '22 12:10

darrell