Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why can't get string with PIL and pytesseract?

It is a simple Optical Character Recognition (OCR) program in Python 3 to get string, I have uploaded the target gif file here, please download it and save it as /tmp/target.gif.

enter image description here

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
print(pytesseract.image_to_string(Image.open('/tmp/target.gif')))

I paste all the error info here, please fix it to get the characters from image.

/usr/lib/python3/dist-packages/PIL/Image.py:925: UserWarning: Couldn't allocate palette entry for transparency
  "for transparency")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/usr/local/lib/python3.5/dist-packages/pytesseract/pytesseract.py", line 309, in image_to_string
    }[output_type]()
  File "/usr/local/lib/python3.5/dist-packages/pytesseract/pytesseract.py", line 308, in <lambda>
    Output.STRING: lambda: run_and_get_output(*args),
  File "/usr/local/lib/python3.5/dist-packages/pytesseract/pytesseract.py", line 208, in run_and_get_output
    temp_name, input_filename = save_image(image)
  File "/usr/local/lib/python3.5/dist-packages/pytesseract/pytesseract.py", line 136, in save_image
    image.save(input_file_name, format=img_extension, **image.info)
  File "/usr/lib/python3/dist-packages/PIL/Image.py", line 1728, in save
    save_handler(self, fp, filename)
  File "/usr/lib/python3/dist-packages/PIL/GifImagePlugin.py", line 407, in _save
    _get_local_header(fp, im, (0, 0), flags)
  File "/usr/lib/python3/dist-packages/PIL/GifImagePlugin.py", line 441, in _get_local_header
    transparency = int(transparency)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'

I convert it with convert command in bash.

convert  "/tmp/target.gif"   "/tmp/target.jpg"

I show /tmp/target.gif and /tmp/target.jpg here. enter image description here

Then execute the above python code again.

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
print(pytesseract.image_to_string(Image.open('/tmp/target.jpg')))

Nothing can i get with the pytesseract.image_to_string(Image.open('/tmp/target.jpg')),i get blank character.

enter image description here For Trenton_M's code:

>>> img1 = remove_noise_and_smooth(r'/tmp/target.jpg')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 3, in remove_noise_and_smooth
AttributeError: 'NoneType' object has no attribute 'astype'
Thalish Sajeed

For Thalish Sajeed's code:

enter image description here

Omit the error info caused by print(pytesseract.image_to_string(Image.open(filename))).

Type "help", "copyright", "credits" or "license" for more information.
>>> from PIL import Image
>>> import pytesseract
>>> import matplotlib.pyplot as plt
>>> import cv2
>>> import numpy as np
>>> 
>>> 
>>> def display_image(filename, length_box=60, width_box=30):
...     if type(filename) == np.ndarray:
...         image = filename
...     else:
...         image = cv2.imread(filename)
...     plt.figure(figsize=(length_box, width_box))
...     plt.imshow(image, cmap="gray")
... 
>>> 
>>> filename = r"/tmp/target.jpg"
>>> display_image(filename)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 7, in display_image
  File "/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py", line 2699, in imshow
    None else {}), **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/matplotlib/__init__.py", line 1810, in inner
    return func(ax, *args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/matplotlib/axes/_axes.py", line 5494, in imshow
    im.set_data(X)
  File "/usr/local/lib/python3.5/dist-packages/matplotlib/image.py", line 634, in set_data
    raise TypeError("Image data cannot be converted to float")
TypeError: Image data cannot be converted to float
>>>

@Thalish Sajeed,Why i got 9244K instead of 0244k with your code? Here is my tested sample file.

enter image description here The extracted string.

enter image description here

@Trenton_M,correct a little typo and loss in your code,and delete the line plt.show() as your suggestion.

>>> import cv2,pytesseract
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> 
>>> 
>>> def image_smoothening(img):
...     ret1, th1 = cv2.threshold(img, 88, 255, cv2.THRESH_BINARY)
...     ret2, th2 = cv2.threshold(th1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
...     blur = cv2.GaussianBlur(th2, (5, 5), 0)
...     ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
...     return th3
... 
>>> 
>>> def remove_noise_and_smooth(file_name):
...     img = cv2.imread(file_name, 0)
...     filtered = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 41)
...     kernel = np.ones((1, 1), np.uint8)
...     opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)
...     closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)
...     img = image_smoothening(img)
...     or_image = cv2.bitwise_or(img, closing)
...     return or_image
... 
>>> 
>>> cv2_thresh_list = [cv2.THRESH_BINARY, cv2.THRESH_TRUNC, cv2.THRESH_TOZERO]
>>> fn = r'/tmp/target.jpg'
>>> img1 = remove_noise_and_smooth(fn)
>>> img2 = cv2.imread(fn, 0)
>>> for i, img in enumerate([img1, img2]):
...     img_type = {0: 'Preprocessed Images\n',
...                 1: '\nUnprocessed Images\n'}
...     print(img_type[i])
...     for item in cv2_thresh_list:
...         print('Thresh: {}'.format(str(item)))
...         _, thresh = cv2.threshold(img, 127, 255, item)
...         plt.imshow(thresh, 'gray')
...         f_name = '{0}.jpg'.format(str(item))
...         plt.savefig(f_name)
...         print('OCR Result: {}\n'.format(pytesseract.image_to_string(f_name)))

... Preprocessed Images

In my console ,all the output info are as following:

Thresh: 0
<matplotlib.image.AxesImage object at 0x7fbc2519a6d8>
OCR Result: 10
15
20 

Edfifi
10
2 o 30 40 so
so

Thresh: 2
<matplotlib.image.AxesImage object at 0x7fbc255e7eb8>
OCR Result: 10
15
20
Edfifi
10
2 o 30 40 so
so
Thresh: 3
<matplotlib.image.AxesImage object at 0x7fbc25452fd0>
OCR Result: 10
15
20
Edfifi
10
2 o 30 40 so
so
Unprocessed Images
Thresh: 0
<matplotlib.image.AxesImage object at 0x7fbc25464c88>
OCR Result: 10
15
20
Thresh: 2
<matplotlib.image.AxesImage object at 0x7fbc254520f0>
OCR Result: 10
15
2o
2o
30 40 50
Thresh: 3
<matplotlib.image.AxesImage object at 0x7fbc1e1968d0>
OCR Result: 10
15
20

Where is the string 0244R?

like image 724
showkey Avatar asked Jul 24 '19 13:07

showkey


People also ask

Does Pytesseract need Tesseract?

You can install Python packages, but also non-Python packages with the Anaconda Prompt. Since tesseract is non-Python package needed to use pytesseract, I think the Anaconda distribution of Python and the conda package manager is the way to go.

What is Pytesseract library in Python?

Pytesseract or Python-tesseract is an OCR tool for python that also serves as a wrapper for the Tesseract-OCR Engine. It can read and recognize text in images and is commonly used in python ocr image to text use cases.

How does Python-Tesseract work?

Python-tesseract is an optical character recognition (OCR) tool for python. That is, it will recognize and “read” the text embedded in images. Python-tesseract is a wrapper for Google's Tesseract-OCR Engine.


2 Answers

Let's start with the JPG image, because pytesseract has issues operating on GIF image formats. reference

filename = "/tmp/target.jpg"
image = cv2.imread(filename)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
ret, threshold = cv2.threshold(gray,55, 255, cv2.THRESH_BINARY)
print(pytesseract.image_to_string(threshold))

Let's try to breakdown the issues here.

Your image is too noisy for tesseract engine to identify the letters, We use some simple image processing techniques such as grayscaling and thresholding to remove some noise from the image.

Then when we send it to the OCR engine, we see that the letters are captured more accurately.

You can find my notebook where I tested this out if you follow this github link

Edit - I have updated the notebook with some additional image cleaning techniques. The source image is too noisy for tesseract to work directly out of the box on the image. You need to use image cleaning techniques.

You can vary the thresholding parameters or swap out gaussian blur for some other technique until you get your desired results.

If you are looking to run OCR on noisy images - please check out commercial OCR providers such as google-cloud-vision. They provide 1000 OCR calls free per month.

like image 86
Thalish Sajeed Avatar answered Nov 07 '22 03:11

Thalish Sajeed


First: make certain you've installed the Tesseract program (not just the python package)

Jupyter Notebook of Solution: Only the image passed through remove_noise_and_smooth is successfully translated with OCR.

When attempting to convert image.gif, TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple' is generated.

Rename image.gif to image.jpg, the TypeError is generated

Open image.gif and 'save as' image.jpg, the output is blank, which means the text wasn't recognized.

enter image description here

from PIL import Image
import pytesseract

# If you don't have tesseract executable in your PATH, include the following:
# your path may be different than mine
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files (x86)/Tesseract-OCR/tesseract.exe"

imgo = Image.open('0244R_clean.jpg')

print(pytesseract.image_to_string(imgo))
  • No text is recognized from the original image, so it may require post-processing to clean prior to OCR
  • I created a clean image, which pytesseract extracts the text from without issue. The image is included below, so you can test it with your own code to verify functionality.

enter image description here

Add Post-Processing

Improve Accuracy of OCR using Image Preprocessing

OpenCV

import cv2
import numpy as np
import matplotlib.pyplot as plt


def image_smoothening(img):
    ret1, th1 = cv2.threshold(img, 88, 255, cv2.THRESH_BINARY)
    ret2, th2 = cv2.threshold(th1, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    blur = cv2.GaussianBlur(th2, (5, 5), 0)
    ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return th3


def remove_noise_and_smooth(file_name):
    img = cv2.imread(file_name, 0)
    filtered = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 41)
    kernel = np.ones((1, 1), np.uint8)
    opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)
    closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)
    img = image_smoothening(img)
    or_image = cv2.bitwise_or(img, closing)
    return or_image


cv2_thresh_list = [cv2.THRESH_BINARY, cv2.THRESH_TRUNC, cv2.THRESH_TOZERO]

fn = r'/tmp/target.jpg'
img1 = remove_noise_and_smooth(fn)
img2 = cv2.imread(fn, 0)
for i, img in enumerate([img1, img2]):
    img_type = {0: 'Preprocessed Images\n',
                1: '\nUnprocessed Images\n'}
    print(img_type[i])
    for item in cv2_thresh_list:
        print('Thresh: {}'.format(str(item)))
        _, thresh = cv2.threshold(img, 127, 255, item)
        plt.imshow(thresh, 'gray')
        f_name = '{}_{}.jpg'.format(i, str(item))
        plt.savefig(f_name)
        print('OCR Result: {}\n'.format(pytesseract.image_to_string(f_name)))

img1 will generate the following new images:

enter image description here

img2 will generate these new images:

enter image description here

like image 38
Trenton McKinney Avatar answered Nov 07 '22 04:11

Trenton McKinney