Is there any python module to convert PDF files into text? I tried one piece of code found in Activestate which uses pypdf but the text generated had no space between and was of no use.
open() to open files and f. close() to close it.
Extracting Text from PDF FilePython package PyPDF can be used to achieve what we want (text extraction), although it can do more than what we need. This package can also be used to generate, decrypting and merging PDF files.
There are many applications to what OCR can do in term of document intelligence. Using pytesseract, one can extract almost all the data irrespective of the format of the documents (whether its a scanned document or a pdf or a simple jpeg image).
Try PDFMiner. It can extract text from PDF files as HTML, SGML or "Tagged PDF" format.
The Tagged PDF format seems to be the cleanest, and stripping out the XML tags leaves just the bare text.
A Python 3 version is available under:
The PDFMiner package has changed since codeape posted.
EDIT (again):
PDFMiner has been updated again in version 20100213
You can check the version you have installed with the following:
>>> import pdfminer >>> pdfminer.__version__ '20100213'
Here's the updated version (with comments on what I changed/added):
def pdf_to_csv(filename): from cStringIO import StringIO #<-- added so you can copy/paste this to try it from pdfminer.converter import LTTextItem, TextConverter from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter class CsvConverter(TextConverter): def __init__(self, *args, **kwargs): TextConverter.__init__(self, *args, **kwargs) def end_page(self, i): from collections import defaultdict lines = defaultdict(lambda : {}) for child in self.cur_item.objs: if isinstance(child, LTTextItem): (_,_,x,y) = child.bbox #<-- changed line = lines[int(-y)] line[x] = child.text.encode(self.codec) #<-- changed for y in sorted(lines.keys()): line = lines[y] self.outfp.write(";".join(line[x] for x in sorted(line.keys()))) self.outfp.write("\n") # ... the following part of the code is a remix of the # convert() function in the pdfminer/tools/pdf2text module rsrc = PDFResourceManager() outfp = StringIO() device = CsvConverter(rsrc, outfp, codec="utf-8") #<-- changed # becuase my test documents are utf-8 (note: utf-8 is the default codec) doc = PDFDocument() fp = open(filename, 'rb') parser = PDFParser(fp) #<-- changed parser.set_document(doc) #<-- added doc.set_parser(parser) #<-- added doc.initialize('') interpreter = PDFPageInterpreter(rsrc, device) for i, page in enumerate(doc.get_pages()): outfp.write("START PAGE %d\n" % i) interpreter.process_page(page) outfp.write("END PAGE %d\n" % i) device.close() fp.close() return outfp.getvalue()
Edit (yet again):
Here is an update for the latest version in pypi, 20100619p1
. In short I replaced LTTextItem
with LTChar
and passed an instance of LAParams to the CsvConverter constructor.
def pdf_to_csv(filename): from cStringIO import StringIO from pdfminer.converter import LTChar, TextConverter #<-- changed from pdfminer.layout import LAParams from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter class CsvConverter(TextConverter): def __init__(self, *args, **kwargs): TextConverter.__init__(self, *args, **kwargs) def end_page(self, i): from collections import defaultdict lines = defaultdict(lambda : {}) for child in self.cur_item.objs: if isinstance(child, LTChar): #<-- changed (_,_,x,y) = child.bbox line = lines[int(-y)] line[x] = child.text.encode(self.codec) for y in sorted(lines.keys()): line = lines[y] self.outfp.write(";".join(line[x] for x in sorted(line.keys()))) self.outfp.write("\n") # ... the following part of the code is a remix of the # convert() function in the pdfminer/tools/pdf2text module rsrc = PDFResourceManager() outfp = StringIO() device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams()) #<-- changed # becuase my test documents are utf-8 (note: utf-8 is the default codec) doc = PDFDocument() fp = open(filename, 'rb') parser = PDFParser(fp) parser.set_document(doc) doc.set_parser(parser) doc.initialize('') interpreter = PDFPageInterpreter(rsrc, device) for i, page in enumerate(doc.get_pages()): outfp.write("START PAGE %d\n" % i) if page is not None: interpreter.process_page(page) outfp.write("END PAGE %d\n" % i) device.close() fp.close() return outfp.getvalue()
EDIT (one more time):
Updated for version 20110515
(thanks to Oeufcoque Penteano!):
def pdf_to_csv(filename): from cStringIO import StringIO from pdfminer.converter import LTChar, TextConverter from pdfminer.layout import LAParams from pdfminer.pdfparser import PDFDocument, PDFParser from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter class CsvConverter(TextConverter): def __init__(self, *args, **kwargs): TextConverter.__init__(self, *args, **kwargs) def end_page(self, i): from collections import defaultdict lines = defaultdict(lambda : {}) for child in self.cur_item._objs: #<-- changed if isinstance(child, LTChar): (_,_,x,y) = child.bbox line = lines[int(-y)] line[x] = child._text.encode(self.codec) #<-- changed for y in sorted(lines.keys()): line = lines[y] self.outfp.write(";".join(line[x] for x in sorted(line.keys()))) self.outfp.write("\n") # ... the following part of the code is a remix of the # convert() function in the pdfminer/tools/pdf2text module rsrc = PDFResourceManager() outfp = StringIO() device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams()) # becuase my test documents are utf-8 (note: utf-8 is the default codec) doc = PDFDocument() fp = open(filename, 'rb') parser = PDFParser(fp) parser.set_document(doc) doc.set_parser(parser) doc.initialize('') interpreter = PDFPageInterpreter(rsrc, device) for i, page in enumerate(doc.get_pages()): outfp.write("START PAGE %d\n" % i) if page is not None: interpreter.process_page(page) outfp.write("END PAGE %d\n" % i) device.close() fp.close() return outfp.getvalue()
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With