Convert scanned pdf to text python

Tags:

I have a scanned PDF file and I am trying to extract text from it. I tried to use pypdfocr to run OCR on it, but I get this error:

"could not found ghostscript in the usual place"

After searching, I found the solution "Linking Ghostscript to pypdfocr in Windows Platform". I downloaded Ghostscript and added it to an environment variable, but I still get the same error.

How can I search for text in my scanned PDF file using Python?

Thanks.

Edit: here is my code sample:

Click to copy

import os
import sys
import re
import json
import shutil
import glob
from pypdfocr import pypdfocr_gs
from pypdfocr import pypdfocr_tesseract 
from PIL import Image

# Folder that is searched for PDFs to process. PATH_TO_MY_SCANNED_PDF is a
# placeholder: it is not defined in this snippet and must be supplied.
path = PATH_TO_MY_SCANNED_PDF
# Not referenced anywhere else in this snippet.
mainL = []
# Dict passed to the PyGs and PyTesseract constructors; left empty here.
kk = {}


def new_init(self, kk):
    """Initialise ``self`` with a fixed OCR language, binary name and messages.

    ``kk`` is accepted as the second argument but is not read.
    """
    tesseract_binary = "tesseract"
    missing_template = """ 
                Could not execute %s
                Please make sure you have Tesseract installed correctly
                """

    self.lang = 'heb'
    self.binary = tesseract_binary

    # Build the message table entry by entry, keeping the key order.
    messages = {}
    messages['TS_MISSING'] = missing_template % tesseract_binary
    messages['TS_VERSION'] = 'Tesseract version is too old'
    messages['TS_img_MISSING'] = 'Cannot find specified tiff file'
    messages['TS_FAILED'] = 'Tesseract-OCR execution failed!'
    self.msgs = messages

# Replace PyTesseract's initializer with new_init; this must happen before the
# PyTesseract instance below is constructed so that it runs new_init.
pypdfocr_tesseract.PyTesseract.__init__ = new_init  

# Module-level helper objects, both constructed with the kk dict. secFile uses
# wow for make_img_from_pdf and tt for make_hocr_from_pnm.
wow = pypdfocr_gs.PyGs(kk)
tt = pypdfocr_tesseract.PyTesseract(kk)


def secFile(filename,oldfilename):
    """OCR one PDF through intermediate images and pass the text to findNum.

    Steps: render ``filename`` to images with the module-level ``wow`` object,
    save every ``.jpg`` found as a ``.tiff``, run ``tt.make_hocr_from_pnm`` on
    each TIFF, concatenate the contents of the resulting ``.html`` files (each
    prefixed with "#", trailing whitespace stripped from every line), hand the
    text to ``findNum`` together with ``oldfilename``, and finally delete every
    regular file in the "PATH" folder.

    Args:
        filename: Path of the PDF to render and OCR.
        oldfilename: Path forwarded to ``findNum`` to name its output file.
    """
    wow.make_img_from_pdf(filename)

    # NOTE(review): this folder is hard-coded while the later steps use the
    # "PATH" placeholder -- presumably the same directory; confirm.
    for jpg_path in glob.glob("X:/e206333106/ocr-114/balagan/" + '*.jpg'):
        im = Image.open(jpg_path)
        im.save(jpg_path + ".tiff")

    for tiff_path in glob.glob("PATH" + '*.tiff'):
        tt.make_hocr_from_pnm(tiff_path)

    # Collect the per-file text and join once instead of repeated concatenation.
    page_texts = []
    for html_path in glob.glob("PATH" + '*.html'):
        with open(html_path) as myfile:
            page_texts.append("#" + "".join(line.rstrip() for line in myfile))
    pdftxt = "".join(page_texts)
    findNum(pdftxt, oldfilename)

    folder = "PATH"

    # Best-effort cleanup: report a file that cannot be removed and continue.
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
        except OSError as e:
            print(e)

def pdf2ocr(filename):
    """Run the ``pypdfocr`` command-line tool with language ``heb`` on a PDF.

    Args:
        filename: Path of the PDF to process.
    """
    # Quote the path so that one containing spaces stays a single argument.
    os.system('pypdfocr -l heb "%s"' % filename)

def ocr2txt(filename):
    """Extract the text of a PDF's ``_ocr.pdf`` companion and pass it to findNum.

    Runs the ``pdf2txt`` command to write the text of the ``_ocr.pdf`` file to
    ``"PATH" + <base name>_ocr.txt``, reads that file back with trailing
    whitespace stripped from every line, and calls ``findNum`` with the text
    and the original ``filename``.

    Args:
        filename: Path of the original PDF; the input and output names are
            derived from it by replacing ".pdf".
    """
    output1 = "PATH" + os.path.basename(filename.replace(".pdf", "_ocr.txt"))
    input1 = filename.replace(".pdf", "_ocr.pdf")

    # The "-o" option belongs inside the command string; both paths are quoted
    # so that names containing spaces stay single arguments.
    os.system('pdf2txt -o "%s" "%s"' % (output1, input1))

    with open(output1) as myfile:
        pdftxt = "".join(line.rstrip() for line in myfile)
    findNum(pdftxt, filename)


def findNum(pdftxt,pdffile):
    """Write every standalone run of digits found in ``pdftxt`` to a text file.

    The output file is ``'PATH' + basename(pdffile) + '.txt'``. Each number is
    written preceded by a comma, so the content looks like ``,12,345``; the
    file is created empty when no number is found.

    Args:
        pdftxt: Text to scan for numbers.
        pdffile: Path whose base name is used to build the output file name.
    """
    numbers = re.findall(r'\b\d+\b', pdftxt)

    # ``with`` guarantees the file is closed even if a write fails.
    with open('PATH' + os.path.basename(pdffile) + '.txt', 'w') as output:
        output.write("".join("," + number for number in numbers))

def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# Process every PDF found directly inside ``path``. print() is called with a
# single argument so the output is the same on Python 2 and Python 3.
i = 0
files = glob.glob(path + '\\*.pdf')
print(path)
print(files)
for pdf_path in files:
    if pdf_path.endswith(".pdf"):
        if is_ascii(pdf_path):
            print(pdf_path)
            pdf2ocr(pdf_path)
            ocr2txt(pdf_path)
        else:
            # Non-ASCII paths are copied to a numbered name and handled by
            # secFile, which receives the original path as well.
            newname = "PATH" + str(i) + ".pdf"
            shutil.copyfile(pdf_path, newname)
            print(newname)
            secFile(newname, pdf_path)
        i = i + 1

# Copy every *_ocr.pdf out of ``path`` and delete it from there.
files = glob.glob(path + '\\' + '*_ocr.pdf')

for ocr_pdf in files:
    print(ocr_pdf)
    shutil.copyfile(ocr_pdf, "PATH" + os.path.basename(ocr_pdf))
    os.remove(ocr_pdf)

370

asked Aug 03 '17 09:08

Michal

1 Answer

Convert each PDF's pages to images, use pytesseract to do the OCR, and export each page to its own text file.

Install these....

conda install -c conda-forge pytesseract

conda install -c conda-forge tesseract

pip install pdf2image

Click to copy

import pytesseract
from pdf2image import convert_from_path
import glob

# Every PDF in the input folder; replace "yourPath" with the real directory.
pdfs = glob.glob(r"yourPath\*.pdf")

for pdf_path in pdfs:
    # Render the PDF's pages to images; the second argument is the DPI.
    pages = convert_from_path(pdf_path, 500)

    for pageNum,imgBlob in enumerate(pages):
        text = pytesseract.image_to_string(imgBlob,lang='eng')

        # pdf_path[:-4] drops the ".pdf" suffix; page numbers start at 0.
        # Write UTF-8 explicitly so OCR output containing non-ASCII characters
        # does not depend on the platform's default encoding.
        with open(f'{pdf_path[:-4]}_page{pageNum}.txt', 'w', encoding='utf-8') as the_file:
            the_file.write(text)

182

answered Sep 19 '22 07:09

DougR

Related questions
                            
                                Can a Jupyter / IPython notebook take arguments in the URL?
                            
                                Using django for CLI tool
                            
                                Difference between BaseSpider and CrawlSpider
                            
                                Display and format Django DurationField in template
                            
                                While submit job with pyspark, how to access static files upload with --files argument?
                            
                                Python yield vs Ruby yield
                            
                                Filter by whether column value equals a list in Spark
                            
                                map vs list; why different behaviour?
                            
                                In Python, does 'return self' return a copy of the object or a pointer?
                            
                                Pycharm does not see files in relative path with ../
                            
                                Selenium leaves behind running processes?
                            
                                How to use dateutil.relativedelta in Python 3.x?
                            
                                404 Response when running FlaskClient test method
                            
                                Python - unpacking kwargs in local function call
                            
                                Selenium can't find element by name or id (python)
                            
                                ArrayField missing 1 required positional argument
                            
                                Adding pandas columns to a sparse matrix
                            
                                Fastest way to left-cycle a numpy array (like pop, push for a queue)
                            
                                How to download a HTML webpage using Selenium with python?
                            
                                How to filter objects by ignoring upper and lower case letter django

Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!

Donate Us With

Convert scanned pdf to text python

Tags:

python

pdf

ocr

ghostscript

Michal

People also ask

1 Answer

DougR

Recent Activity

Donate For Us