Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Reading PDF documents in .Net [closed]

Tags:

c#

.net

pdf

Is there an open source library that will help me with reading/parsing PDF documents in .NET/C#?

like image 238
JRoppert Avatar asked Sep 17 '08 13:09

JRoppert


People also ask

Why is PDF blacked out?

REASON. This problem is usually caused by corrupted PDF files produced by outdated scanning software. When Pipeline tries to read such a document, it is unable to read some of the information properly thereby resulting in one of these problems.

How can I open PDF file in asp net?

Open Visual Studio 2012 and click "File" -> "New" -> "web site...". A window is opened. In this window, click "Empty Web Site Application" under Visual C#. Give the name of your application as "Open_PDF" and then click "Ok".

Why Online PDF is not opening?

Try resetting the display preference in your browser to clear up the viewing issue. In Reader or Acrobat, right-click the document window, and choose Page Display Preferences. From the list at left, select Internet. Deselect Display PDF in browser, and then click OK.


2 Answers

Since this question was last answered in 2008, iTextSharp has improved its API dramatically. If you download the latest version of the library from http://sourceforge.net/projects/itextsharp/, you can use the following snippet of code to extract all text from a PDF into a string.

using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
// Alias for the iTextSharp extractor: inside the class below, the bare name
// "PdfTextExtractor" binds to the enclosing class rather than the library type.
using ITextPdfTextExtractor = iTextSharp.text.pdf.parser.PdfTextExtractor;

namespace PdfParser
{
    /// <summary>
    /// Convenience wrapper that pulls the text of every page out of a PDF file.
    /// </summary>
    public static class PdfTextExtractor
    {
        /// <summary>
        /// Reads the PDF at <paramref name="path"/> and returns the text of all
        /// pages, concatenated in page order.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The extracted text of pages 1 through the last page.</returns>
        public static string pdfText(string path)
        {
            PdfReader reader = new PdfReader(path);
            try
            {
                StringBuilder text = new StringBuilder();

                // iTextSharp page numbers are 1-based.
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    text.Append(ITextPdfTextExtractor.GetTextFromPage(reader, page));
                }

                return text.ToString();
            }
            finally
            {
                // Release the file even when a page fails to parse.
                reader.Close();
            }
        }
    }
}
like image 159
Brock Nusser Avatar answered Sep 21 '22 21:09

Brock Nusser


iTextSharp is the best bet. I used it to build a spider for Lucene.Net so that it could crawl PDF files.

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using iTextSharp.text.pdf;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it by scanning the raw
    /// page content streams for text-showing operators.
    /// </summary>
    /// <remarks>
    /// Operators referred to in this class:
    /// BT = beginning of a text object, ET = end of a text object,
    /// Td/TD = move to the start of the next line, T* / ' / " = next-line
    /// operators, Tj = show text, 5 Ts = superscript, -5 Ts = subscript.
    /// </remarks>
    public class PDFParser
    {
        /// <summary>
        /// The number of most recent characters kept while scanning, used to
        /// recognise an operator after it has been read.
        /// </summary>
        private const int NumberOfCharsToKeep = 15;

        // Operator groups, hoisted so they are not re-allocated for every byte.
        private static readonly string[] MoveToNextLineTokens = { "TD", "Td" };
        private static readonly string[] NewLineTokens = { "'", "T*", "\"" };
        private static readonly string[] ShowTextTokens = { "Tj" };
        private static readonly string[] EndTextTokens = { "ET" };
        private static readonly string[] BeginTextTokens = { "BT" };

        /// <summary>
        /// Escape sequences left in the extracted text (column 0, as a regular
        /// expression) and the character each one is replaced with (column 1).
        /// Rows are applied in order.
        /// </summary>
        private static readonly string[,] EscapeTable =
        {
            { @"\\\(", "(" }, { @"\\\)", ")" }, { @"\\226", "-" }, { @"\\222", "'" },
            { @"\\223", "\"" }, { @"\\224", "\"" }, { @"\\340", "à" }, { @"\\342", "â" },
            { @"\\344", "ä" }, { @"\\300", "À" }, { @"\\302", "Â" }, { @"\\304", "Ä" },
            { @"\\351", "é" }, { @"\\350", "è" }, { @"\\352", "ê" }, { @"\\353", "ë" },
            { @"\\311", "É" }, { @"\\310", "È" }, { @"\\312", "Ê" }, { @"\\313", "Ë" },
            { @"\\362", "ò" }, { @"\\364", "ô" }, { @"\\366", "ö" }, { @"\\322", "Ò" },
            { @"\\324", "Ô" }, { @"\\326", "Ö" }, { @"\\354", "ì" }, { @"\\356", "î" },
            { @"\\357", "ï" }, { @"\\314", "Ì" }, { @"\\316", "Î" }, { @"\\317", "Ï" },
            { @"\\347", "ç" }, { @"\\307", "Ç" }, { @"\\371", "ù" }, { @"\\373", "û" },
            { @"\\374", "ü" }, { @"\\331", "Ù" }, { @"\\333", "Û" }, { @"\\334", "Ü" },
            { @"\\256", "®" }, { @"\\231", "™" }, { @"\\253", "«" }, { @"\\273", "»" },
            { @"\\251", "©" }, { @"\\221", "'" }
        };

        // Built once instead of on every CleanupContent call. Declared after
        // EscapeTable because static initialisers run in textual order.
        private static readonly Regex[] EscapeRegexes = BuildEscapeRegexes();

        /// <summary>
        /// Extracts the text of every page of a PDF file and writes it, UTF-8
        /// encoded, to an output file. A 68-character "#" progress bar is
        /// written to the console while pages are processed.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>
        /// <c>true</c> if all pages were processed; <c>false</c> if any
        /// exception occurred (the exception itself is swallowed).
        /// </returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            PdfReader reader = null;
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                const int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        totalWritten += WriteProgressMarks((int)charUnit);
                    }
                    else
                    {
                        // Less than one mark per page: accumulate until a
                        // whole mark is due.
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            totalWritten += WriteProgressMarks((int)curUnit);
                            curUnit = 0;
                        }
                    }
                }

                // Truncation above can leave the bar short; pad it to full width.
                if (totalWritten < totalLen)
                {
                    WriteProgressMarks(totalLen - totalWritten);
                }

                return true;
            }
            catch
            {
                // Best-effort API: failures are reported through the return value.
                return false;
            }
            finally
            {
                if (reader != null)
                {
                    reader.Close();
                }

                if (outFile != null)
                {
                    outFile.Close();
                }
            }
        }

        /// <summary>
        /// This method processes an uncompressed Adobe (text) object
        /// and extracts text.
        /// </summary>
        /// <param name="input">the uncompressed content stream bytes.</param>
        /// <returns>
        /// The text found inside parentheses within BT ... ET text objects,
        /// after escape clean-up; an empty string if <paramref name="input"/>
        /// is null or empty, or if any exception occurs while scanning.
        /// </returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0)
            {
                return "";
            }

            try
            {
                StringBuilder result = new StringBuilder();

                // Flag showing if we are currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal,
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars so operators can be recognised after the fact.
                char[] previousCharacters = new char[NumberOfCharsToKeep];
                for (int j = 0; j < previousCharacters.Length; j++)
                {
                    previousCharacters[j] = ' ';
                }

                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                    {
                        // NOTE(review): byte 213 is treated as an apostrophe;
                        // presumably a right single quote in the source
                        // encoding - confirm.
                        c = '\'';
                    }

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(MoveToNextLineTokens, previousCharacters))
                            {
                                // "\n\r" (in this order) is kept as-is so the
                                // output stays identical for existing callers.
                                result.Append("\n\r");
                            }
                            else if (CheckToken(NewLineTokens, previousCharacters))
                            {
                                result.Append("\n");
                            }
                            else if (CheckToken(ShowTextTokens, previousCharacters))
                            {
                                result.Append(" ");
                            }
                        }

                        if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                        {
                            // End of a text object
                            inTextObject = false;
                            result.Append(" ");
                        }
                        else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                        {
                            // Start outputting text
                            bracketDepth = 1;
                        }
                        else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                        {
                            // Stop outputting text
                            bracketDepth = 0;
                        }
                        else if (bracketDepth == 1)
                        {
                            // Just a normal text character
                            if (c == '\\' && !nextLiteral)
                            {
                                // Keep the backslash; CleanupContent resolves
                                // the escape sequence afterwards.
                                result.Append(c);
                                nextLiteral = true;
                            }
                            else
                            {
                                if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                                {
                                    result.Append(c);
                                }

                                nextLiteral = false;
                            }
                        }
                    }

                    // Store the recent characters for when we have to look
                    // back: shift left by one and append the current one.
                    Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
                    previousCharacters[previousCharacters.Length - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(result.ToString());
            }
            catch
            {
                // Best-effort API: any failure yields an empty result.
                return "";
            }
        }

        /// <summary>
        /// Replaces the escape sequences listed in <see cref="EscapeTable"/>
        /// (escaped parentheses and a set of backslash-octal codes) with the
        /// characters they stand for.
        /// </summary>
        /// <param name="text">the raw extracted text.</param>
        /// <returns>the text with the known escape sequences replaced.</returns>
        private string CleanupContent(string text)
        {
            for (int i = 0; i < EscapeRegexes.Length; i++)
            {
                text = EscapeRegexes[i].Replace(text, EscapeTable[i, 1]);
            }

            return text;
        }

        /// <summary>
        /// Checks whether one of the given operator tokens has just been read:
        /// the token must sit immediately before the most recent character,
        /// and be preceded and followed by a space, CR or LF.
        /// </summary>
        /// <remarks>
        /// Tokens of any length are supported. The one-character operators
        /// (' and ") previously caused an out-of-range read of token[1], which
        /// made the caller discard the whole page.
        /// </remarks>
        /// <param name="tokens">the searched tokens</param>
        /// <param name="recent">the recent character array, oldest first</param>
        /// <returns><c>true</c> if any token matches; otherwise <c>false</c>.</returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            int last = recent.Length - 1;

            foreach (string token in tokens)
            {
                // Index of the token's first character; one more slot is
                // needed in front of it for the leading delimiter.
                int start = last - token.Length;
                if (token.Length == 0 || start < 1)
                {
                    continue;
                }

                if (!IsTokenDelimiter(recent[last]) || !IsTokenDelimiter(recent[start - 1]))
                {
                    continue;
                }

                bool matches = true;
                for (int k = 0; k < token.Length; k++)
                {
                    if (recent[start + k] != token[k])
                    {
                        matches = false;
                        break;
                    }
                }

                if (matches)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>Returns true for the characters that separate operators: space, CR, LF.</summary>
        private static bool IsTokenDelimiter(char c)
        {
            return c == ' ' || c == 0x0d || c == 0x0a;
        }

        /// <summary>Writes <paramref name="count"/> '#' marks to the console and returns how many were written.</summary>
        private static int WriteProgressMarks(int count)
        {
            int written = 0;
            for (int i = 0; i < count; i++)
            {
                Console.Write("#");
                written++;
            }

            return written;
        }

        /// <summary>Compiles one case-insensitive regex per row of <see cref="EscapeTable"/>.</summary>
        private static Regex[] BuildEscapeRegexes()
        {
            Regex[] regexes = new Regex[EscapeTable.GetLength(0)];
            for (int i = 0; i < regexes.Length; i++)
            {
                regexes[i] = new Regex(EscapeTable[i, 0], RegexOptions.IgnoreCase);
            }

            return regexes;
        }
    }
}
like image 20
ceetheman Avatar answered Sep 21 '22 21:09

ceetheman