Logo Questions Linux Laravel Mysql Ubuntu Git Menu

Convert scanned pdf to text python

I have a scanned PDF file and I am trying to extract text from it. I tried to use pypdfocr to run OCR on it, but I get this error:

"could not found ghostscript in the usual place"

After searching, I found this solution, "Linking Ghostscript to pypdfocr in Windows Platform", and I tried downloading Ghostscript and adding it to an environment variable, but I still get the same error.

How can I search for text in my scanned PDF file using Python?


Edit: here is my code sample:

import os
import sys
import re
import json
import shutil
import glob
from pypdfocr import pypdfocr_gs
from pypdfocr import pypdfocr_tesseract 
from PIL import Image

# Module-level state. `mainL` starts as an empty list and `kk` as an empty
# dict; `kk` is the argument later passed to the PyGs and PyTesseract
# constructors in this sample.
mainL = list()
kk = dict()

def new_init(self, kk):
    """Replacement initialiser intended for ``PyTesseract``.

    Sets the OCR language to ``'heb'``, the binary name to ``"tesseract"``
    and installs the table of error messages on the instance.

    Args:
        self: The instance being initialised.
        kk: Accepted because the class is constructed with one argument;
            it is not read here.
    """
    self.lang = 'heb'
    self.binary = "tesseract"
    self.msgs = {
            'TS_MISSING': """ 
                Could not execute %s
                Please make sure you have Tesseract installed correctly
                """ % self.binary,
            'TS_VERSION':'Tesseract version is too old',
            'TS_img_MISSING':'Cannot find specified tiff file',
            'TS_FAILED': 'Tesseract-OCR execution failed!',
        }  # closing brace was missing, which left the dict literal unterminated

# Monkeypatch: replace PyTesseract's constructor with new_init. This has to
# run before the PyTesseract instance below is created for it to take effect.
pypdfocr_tesseract.PyTesseract.__init__ = new_init  

# Module-level helper objects from pypdfocr's Ghostscript (pypdfocr_gs) and
# Tesseract (pypdfocr_tesseract) modules, both constructed with `kk`.
wow = pypdfocr_gs.PyGs(kk)
tt = pypdfocr_tesseract.PyTesseract(kk)

# secFile: saves a ".tiff" copy of every .jpg in a hard-coded folder, then
# concatenates the contents of the .html files matching "PATH*.html" into
# `pdftxt`, then iterates over the entries of the "PATH" folder.
# Neither `filename` nor `oldfilename` is used in the visible body, and
# `pdftxt` is neither returned nor passed on.
# NOTE(review): several lines of this function appear to be missing from this
# excerpt (see the notes below); as shown it does not parse.
def secFile(filename,oldfilename):

    # Re-save each JPEG through PIL under "<name>.jpg.tiff".
    files = glob.glob("X:/e206333106/ocr-114/balagan/" + '*.jpg')  
    for file in files:
        im = Image.open(file)
        im.save(file + ".tiff") 

    files = glob.glob("PATH" + '*.tiff')  
    # NOTE(review): this loop has no body here -- presumably the per-TIFF OCR
    # call was lost; confirm against the original code.
    for file in files:
    pdftxt = ""    
    files = glob.glob("PATH" + '*.html') 
    for file in files:
        with open(file) as myfile:
            # Each file contributes "#" followed by its lines, right-stripped
            # and joined without separators.
            pdftxt = pdftxt + "#" + "".join(line.rstrip() for line in myfile)

    folder ="PATH"

    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        # NOTE(review): the `if` below is over-indented and the `except` has
        # no matching `try`; the `try:` line and the body of the `if` appear
        # to be missing, so what happens to each file cannot be told from here.
            if os.path.isfile(file_path):
        # Python 2 only syntax: `except Exception, e` and the print statement.
        except Exception, e:
            print e

def pdf2ocr(filename):
    """Run the ``pypdfocr`` command on *filename* with the language ``heb``.

    The command line is built by plain string concatenation and handed to
    ``os.system``; the exit status is ignored and nothing is returned.
    """
    command = ''.join(('pypdfocr -l heb ', filename))
    os.system(command)

def ocr2txt(filename):
    """Dump the text of the OCR'd copy of *filename* using ``pdf2txt``.

    The input to ``pdf2txt`` is *filename* with ``.pdf`` replaced by
    ``_ocr.pdf``; the output file is ``"PATH"`` followed by the base name of
    *filename* with ``.pdf`` replaced by ``_ocr.txt``.

    Args:
        filename: Path of the original PDF.

    Returns:
        The contents of the output file, with trailing whitespace (including
        the newline) stripped from every line and the lines joined without a
        separator.
    """
    pdffile = filename

    output1 = pdffile.replace(".pdf", "_ocr.txt")
    output1 = "PATH" + os.path.basename(output1)

    input1 = pdffile.replace(".pdf", "_ocr.pdf")

    # The "-o" flag must be part of the command string. Outside the quotes it
    # was evaluated as the expression `"pdf2txt" - o + ...` and raised
    # NameError before any command ran.
    os.system("pdf2txt -o " + output1 + " " + input1)

    with open(output1) as myfile:
        pdftxt = "".join(line.rstrip() for line in myfile)

    # Hand the text back to the caller; previously it was computed and dropped.
    return pdftxt

def findNum(pdftxt,pdffile):
    """Write every standalone run of digits in *pdftxt* to a text file.

    Matches are found with the pattern ``\\b\\d+\\b``, so digits embedded in a
    longer word (e.g. ``a7``) are not reported. The output file is
    ``'PATH' + basename(pdffile) + '.txt'``.

    Args:
        pdftxt: Text to search.
        pdffile: Path whose base name is used to build the output file name.
    """
    numbers = re.findall(r'\b\d+\b', pdftxt)

    # `with` guarantees the handle is closed; it used to be left open.
    with open('PATH' + os.path.basename(pdffile) + '.txt', 'w') as output:
        # NOTE(review): the loop body was missing from the original sample.
        # Writing one match per line is the assumed intent -- confirm.
        for number in numbers:
            output.write(number + '\n')

def is_ascii(s):
    """Return True when every character of *s* has a code point below 128.

    An empty *s* yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# Driver script (Python 2: uses print statements).
# NOTE(review): `path` is never assigned anywhere in this sample; it has to be
# defined before this point or the glob call raises NameError.
i = 0     
files = glob.glob(path + '\\*.pdf') 
print path  
print files 
# Copy every PDF whose path is pure ASCII to "PATH<i>.pdf".
for file in files:
    if file.endswith(".pdf"):
        if is_ascii(file):
            print file
            newname = "PATH" + str(i) + ".pdf"
            shutil.copyfile(file, newname)
            print newname
        # The counter advances for every PDF, including the non-ASCII ones
        # that were skipped, so the numbering of the copies can have gaps.
        i = i + 1

# NOTE(review): none of pdf2ocr / ocr2txt / secFile / findNum is called in the
# visible script; presumably the OCR step ran between these two loops and was
# lost from this excerpt -- confirm.
files = glob.glob(path + '\\' + '*_ocr.pdf')         

# Copy each "*_ocr.pdf" found under `path` to "PATH" + its base name.
for file in files:
    print file
    shutil.copyfile(file, "PATH" + os.path.basename(file))
like image 370
Michal Avatar asked Aug 03 '17 09:08


People also ask

How do I convert a scanned PDF to text?

Open a PDF file containing a scanned image in Acrobat for Mac or PC. Click on the “Edit PDF” tool in the right pane. Acrobat automatically applies optical character recognition (OCR) to your document and converts it to a fully editable copy of your PDF. Click the text element you wish to edit and start typing.

Can you extract text from a scanned PDF?

When a document is scanned or photo copied into a PDF, it's essentially a digital photograph; any text within that PDF cannot be copied or changed. However, using an OCR (optical character recognition) program or online service you can convert text contained in an image or PDF into text that can be copied and edited.

1 Answer

Convert pdfs, using pytesseract to do the OCR, and export each page in the pdfs to a text file.

Install these....

conda install -c conda-forge pytesseract

conda install -c conda-forge tesseract

pip install pdf2image

import pytesseract
from pdf2image import convert_from_path
import glob

# OCR every PDF matching the glob below and write one text file per page,
# named "<pdf path without its last 4 characters>_page<N>.txt" (N starts at 0).
pdfs = glob.glob(r"yourPath\*.pdf")

for pdf_path in pdfs:
    # Second positional argument is the rendering DPI per the pdf2image docs;
    # the result is iterated as one image per page.
    pages = convert_from_path(pdf_path, 500)

    for pageNum,imgBlob in enumerate(pages):
        text = pytesseract.image_to_string(imgBlob,lang='eng')

        # UTF-8 is set explicitly so OCR output containing characters outside
        # the platform's default encoding does not raise UnicodeEncodeError.
        with open(f'{pdf_path[:-4]}_page{pageNum}.txt', 'w', encoding='utf-8') as the_file:
            # This write was missing: the recognised text was never saved and
            # the empty `with` body was a syntax error.
            the_file.write(text)
like image 182
DougR Avatar answered Sep 19 '22 07:09
