Is there an open source library that will help me with reading/parsing PDF documents in .NET/C#?

iTextSharp is the best bet. Used it to make a spider for lucene.Net so that it could crawl PDF.
<pre class="prettyprint"><code>
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using iTextSharp.text.pdf;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it by scanning the raw
    /// page content streams for text operators.
    /// </summary>
    /// <remarks>
    /// Operators the scanner reacts to:
    ///   BT        = beginning of a text object
    ///   ET        = end of a text object
    ///   Td, TD    = move to the start of the next line (emits "\n\r")
    ///   T*, ', "  = next-line operators (emit "\n")
    ///   Tj        = show text (emits a separating space)
    /// Text rise ("5 Ts" superscript / "-5 Ts" subscript) is not interpreted.
    /// </remarks>
    public class PDFParser
    {
        #region Fields

        /// <summary>
        /// The number of most recent characters kept for operator detection.
        /// </summary>
        private const int NumberOfCharsToKeep = 15;

        /// <summary>
        /// Width, in '#' marks, of the console progress bar.
        /// </summary>
        private const int ProgressBarWidth = 68;

        // Operator groups, allocated once instead of once per input byte.
        private static readonly string[] MoveToNextLineTokens = { "TD", "Td" };
        private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
        private static readonly string[] ShowTextTokens = { "Tj" };
        private static readonly string[] EndTextTokens = { "ET" };
        private static readonly string[] BeginTextTokens = { "BT" };

        /// <summary>
        /// Escape sequences left in the extracted text (column 0) and the
        /// character each one is replaced with (column 1). Rows are applied
        /// in order. Keeping both values in one row prevents the two lists
        /// from drifting out of alignment.
        /// </summary>
        private static readonly string[,] EscapeTable =
        {
            { @"\(", "(" }, { @"\)", ")" },
            { @"\226", "-" }, { @"\222", "'" }, { @"\223", "\"" }, { @"\224", "\"" },
            { @"\340", "à" }, { @"\342", "â" }, { @"\344", "ä" },
            { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
            { @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
            { @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
            { @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" },
            { @"\322", "Ò" }, { @"\324", "Ô" }, { @"\326", "Ö" },
            { @"\354", "ì" }, { @"\356", "î" }, { @"\357", "ï" },
            { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
            { @"\347", "ç" }, { @"\307", "Ç" },
            { @"\371", "ù" }, { @"\373", "û" }, { @"\374", "ü" },
            { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
            { @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
            { @"\251", "©" }, { @"\221", "'" }
        };

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts the text of every page of a PDF file and writes it,
        /// UTF-8 encoded, to an output file. A progress bar is written to
        /// the console while pages are processed.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>
        /// true when all pages were processed; false when reading the PDF or
        /// writing the output failed (the reason is written to standard error).
        /// </returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            PdfReader reader = null;
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, Encoding.UTF8);

                Console.Write("Processing: ");

                int pageCount = reader.NumberOfPages;
                int totalWritten = 0;

                for (int page = 1; page <= pageCount; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Advance the bar to the share of pages done so far.
                    // Integer arithmetic avoids accumulating rounding error.
                    int target = (int)((long)page * ProgressBarWidth / pageCount);
                    for (; totalWritten < target; totalWritten++)
                    {
                        Console.Write("#");
                    }
                }

                // Always finish with a full bar, including for documents
                // that report zero pages.
                for (; totalWritten < ProgressBarWidth; totalWritten++)
                {
                    Console.Write("#");
                }

                return true;
            }
            catch (Exception ex)
            {
                // The bool return value is the contract, so do not rethrow,
                // but do not hide the reason for the failure either.
                Console.Error.WriteLine("PDF text extraction failed for '" + inFileName + "': " + ex.Message);
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
                // The reader holds the input file; release it on every path.
                if (reader != null) reader.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// Scans the bytes of an uncompressed page content stream and
        /// returns the text found inside its text objects (BT ... ET).
        /// </summary>
        /// <param name="input">the uncompressed content stream bytes.</param>
        /// <returns>
        /// the extracted text with known escape sequences replaced, or an
        /// empty string when <paramref name="input"/> is null or empty.
        /// </returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0)
            {
                return "";
            }

            StringBuilder result = new StringBuilder(input.Length);

            // True while the scanner is between BT and ET.
            bool inTextObject = false;

            // True when the previous character was an unescaped backslash,
            // i.e. the current character must be copied without interpretation
            // (e.g. "\\" for a backslash or "\(" for a parenthesis).
            bool nextLiteral = false;

            // () nesting level; text to output appears inside ().
            int bracketDepth = 0;

            // Sliding window over the most recent characters, used to
            // recognise operators once their trailing delimiter has been read.
            char[] previousCharacters = new char[NumberOfCharsToKeep];
            for (int j = 0; j < previousCharacters.Length; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                // Byte 213 is mapped to an apostrophe, as in the original code
                // (presumably the Mac Roman right single quote - TODO confirm).
                char c = (input[i] == 213) ? '\'' : (char)input[i];

                if (inTextObject)
                {
                    // Positioning operators only count outside of a string.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveToNextLineTokens, previousCharacters))
                        {
                            // NOTE: "\n\r" (not "\r\n") is kept from the original
                            // so that existing output does not change.
                            result.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Keep the backslash; CleanupContent resolves the
                            // escape sequence it introduces.
                            result.Append(c);
                            nextLiteral = true;
                        }
                        else
                        {
                            // Copy printable ASCII and the 128..254 range only.
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                result.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window left by one and store the current character.
                Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
                previousCharacters[previousCharacters.Length - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return CleanupContent(result.ToString());
        }

        /// <summary>
        /// Replaces the escape sequences listed in the escape table (escaped
        /// parentheses and selected octal character codes) with the
        /// characters they stand for.
        /// </summary>
        /// <param name="text">the raw extracted text.</param>
        /// <returns>the text with known escape sequences replaced.</returns>
        private string CleanupContent(string text)
        {
            // Every sequence is a fixed literal, so an ordinal string replace
            // gives the same result as the former per-call Regex objects.
            for (int i = 0; i < EscapeTable.GetLength(0); i++)
            {
                text = text.Replace(EscapeTable[i, 0], EscapeTable[i, 1]);
            }

            return text;
        }
        #endregion

        #region CheckToken
        /// <summary>
        /// Checks whether one of the given operator tokens (e.g. BT) has just
        /// been read: the token must sit directly before the last character
        /// of the window, and be surrounded by a space, CR or LF on both sides.
        /// </summary>
        /// <param name="tokens">the searched tokens, of any non-zero length.</param>
        /// <param name="recent">the recent character array</param>
        /// <returns>true when any of the tokens matches.</returns>
        private static bool CheckToken(string[] tokens, char[] recent)
        {
            if (tokens == null || recent == null || recent.Length == 0)
            {
                return false;
            }

            int last = recent.Length - 1;
            if (!IsTokenDelimiter(recent[last]))
            {
                return false;
            }

            foreach (string token in tokens)
            {
                if (string.IsNullOrEmpty(token))
                {
                    continue;
                }

                // The token's length decides where it starts. The original code
                // always read token[1], which throws for the one-character
                // operators ' and ".
                int start = last - token.Length;
                if (start < 1 || !IsTokenDelimiter(recent[start - 1]))
                {
                    continue;
                }

                bool matches = true;
                for (int k = 0; k < token.Length; k++)
                {
                    if (recent[start + k] != token[k])
                    {
                        matches = false;
                        break;
                    }
                }

                if (matches)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Tells whether a character separates operators: space, CR or LF.
        /// </summary>
        private static bool IsTokenDelimiter(char c)
        {
            return c == ' ' || c == 0x0d || c == 0x0a;
        }
        #endregion
    }
}
</code></pre>

Reading PDF documents in .Net [closed]

2 Answers

Since this question was last answered in 2008, iTextSharp has improved its API dramatically. If you download the latest version from http://sourceforge.net/projects/itextsharp/, you can use the following snippet to extract all the text from a PDF into a string.

using System.Text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
// Alias for the library extractor. Inside the class below, the simple name
// "PdfTextExtractor" binds to the class being declared, not to the library type.
using ITextExtractor = iTextSharp.text.pdf.parser.PdfTextExtractor;

namespace PdfParser
{
    /// <summary>
    /// Convenience wrapper around iTextSharp's text extraction.
    /// </summary>
    public static class PdfTextExtractor
    {
        /// <summary>
        /// Extracts the text of every page of a PDF file into one string.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The text of all pages, concatenated in page order.</returns>
        public static string pdfText(string path)
        {
            PdfReader reader = new PdfReader(path);
            try
            {
                StringBuilder text = new StringBuilder();

                // Pages are numbered from 1.
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    text.Append(ITextExtractor.GetTextFromPage(reader, page));
                }

                return text.ToString();
            }
            finally
            {
                // Release the file even when extraction of a page throws.
                reader.Close();
            }
        }
    }
}

159

answered Sep 21 '22 21:09

Brock Nusser

iTextSharp is the best bet. I used it to build a spider for Lucene.Net so that it could crawl PDF files.

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using iTextSharp.text.pdf;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it by scanning the raw
    /// page content streams for text operators.
    /// </summary>
    /// <remarks>
    /// Operators the scanner reacts to:
    ///   BT        = beginning of a text object
    ///   ET        = end of a text object
    ///   Td, TD    = move to the start of the next line (emits "\n\r")
    ///   T*, ', "  = next-line operators (emit "\n")
    ///   Tj        = show text (emits a separating space)
    /// Text rise ("5 Ts" superscript / "-5 Ts" subscript) is not interpreted.
    /// </remarks>
    public class PDFParser
    {
        #region Fields

        /// <summary>
        /// The number of most recent characters kept for operator detection.
        /// </summary>
        private const int NumberOfCharsToKeep = 15;

        /// <summary>
        /// Width, in '#' marks, of the console progress bar.
        /// </summary>
        private const int ProgressBarWidth = 68;

        // Operator groups, allocated once instead of once per input byte.
        private static readonly string[] MoveToNextLineTokens = { "TD", "Td" };
        private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
        private static readonly string[] ShowTextTokens = { "Tj" };
        private static readonly string[] EndTextTokens = { "ET" };
        private static readonly string[] BeginTextTokens = { "BT" };

        /// <summary>
        /// Escape sequences left in the extracted text (column 0) and the
        /// character each one is replaced with (column 1). Rows are applied
        /// in order. Keeping both values in one row prevents the two lists
        /// from drifting out of alignment.
        /// </summary>
        private static readonly string[,] EscapeTable =
        {
            { @"\(", "(" }, { @"\)", ")" },
            { @"\226", "-" }, { @"\222", "'" }, { @"\223", "\"" }, { @"\224", "\"" },
            { @"\340", "à" }, { @"\342", "â" }, { @"\344", "ä" },
            { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
            { @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
            { @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
            { @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" },
            { @"\322", "Ò" }, { @"\324", "Ô" }, { @"\326", "Ö" },
            { @"\354", "ì" }, { @"\356", "î" }, { @"\357", "ï" },
            { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
            { @"\347", "ç" }, { @"\307", "Ç" },
            { @"\371", "ù" }, { @"\373", "û" }, { @"\374", "ü" },
            { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
            { @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
            { @"\251", "©" }, { @"\221", "'" }
        };

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts the text of every page of a PDF file and writes it,
        /// UTF-8 encoded, to an output file. A progress bar is written to
        /// the console while pages are processed.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>
        /// true when all pages were processed; false when reading the PDF or
        /// writing the output failed (the reason is written to standard error).
        /// </returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            PdfReader reader = null;
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, Encoding.UTF8);

                Console.Write("Processing: ");

                int pageCount = reader.NumberOfPages;
                int totalWritten = 0;

                for (int page = 1; page <= pageCount; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Advance the bar to the share of pages done so far.
                    // Integer arithmetic avoids accumulating rounding error.
                    int target = (int)((long)page * ProgressBarWidth / pageCount);
                    for (; totalWritten < target; totalWritten++)
                    {
                        Console.Write("#");
                    }
                }

                // Always finish with a full bar, including for documents
                // that report zero pages.
                for (; totalWritten < ProgressBarWidth; totalWritten++)
                {
                    Console.Write("#");
                }

                return true;
            }
            catch (Exception ex)
            {
                // The bool return value is the contract, so do not rethrow,
                // but do not hide the reason for the failure either.
                Console.Error.WriteLine("PDF text extraction failed for '" + inFileName + "': " + ex.Message);
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
                // The reader holds the input file; release it on every path.
                if (reader != null) reader.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// Scans the bytes of an uncompressed page content stream and
        /// returns the text found inside its text objects (BT ... ET).
        /// </summary>
        /// <param name="input">the uncompressed content stream bytes.</param>
        /// <returns>
        /// the extracted text with known escape sequences replaced, or an
        /// empty string when <paramref name="input"/> is null or empty.
        /// </returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0)
            {
                return "";
            }

            StringBuilder result = new StringBuilder(input.Length);

            // True while the scanner is between BT and ET.
            bool inTextObject = false;

            // True when the previous character was an unescaped backslash,
            // i.e. the current character must be copied without interpretation
            // (e.g. "\\" for a backslash or "\(" for a parenthesis).
            bool nextLiteral = false;

            // () nesting level; text to output appears inside ().
            int bracketDepth = 0;

            // Sliding window over the most recent characters, used to
            // recognise operators once their trailing delimiter has been read.
            char[] previousCharacters = new char[NumberOfCharsToKeep];
            for (int j = 0; j < previousCharacters.Length; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                // Byte 213 is mapped to an apostrophe, as in the original code
                // (presumably the Mac Roman right single quote - TODO confirm).
                char c = (input[i] == 213) ? '\'' : (char)input[i];

                if (inTextObject)
                {
                    // Positioning operators only count outside of a string.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveToNextLineTokens, previousCharacters))
                        {
                            // NOTE: "\n\r" (not "\r\n") is kept from the original
                            // so that existing output does not change.
                            result.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Keep the backslash; CleanupContent resolves the
                            // escape sequence it introduces.
                            result.Append(c);
                            nextLiteral = true;
                        }
                        else
                        {
                            // Copy printable ASCII and the 128..254 range only.
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                result.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window left by one and store the current character.
                Array.Copy(previousCharacters, 1, previousCharacters, 0, previousCharacters.Length - 1);
                previousCharacters[previousCharacters.Length - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return CleanupContent(result.ToString());
        }

        /// <summary>
        /// Replaces the escape sequences listed in the escape table (escaped
        /// parentheses and selected octal character codes) with the
        /// characters they stand for.
        /// </summary>
        /// <param name="text">the raw extracted text.</param>
        /// <returns>the text with known escape sequences replaced.</returns>
        private string CleanupContent(string text)
        {
            // Every sequence is a fixed literal, so an ordinal string replace
            // gives the same result as the former per-call Regex objects.
            for (int i = 0; i < EscapeTable.GetLength(0); i++)
            {
                text = text.Replace(EscapeTable[i, 0], EscapeTable[i, 1]);
            }

            return text;
        }
        #endregion

        #region CheckToken
        /// <summary>
        /// Checks whether one of the given operator tokens (e.g. BT) has just
        /// been read: the token must sit directly before the last character
        /// of the window, and be surrounded by a space, CR or LF on both sides.
        /// </summary>
        /// <param name="tokens">the searched tokens, of any non-zero length.</param>
        /// <param name="recent">the recent character array</param>
        /// <returns>true when any of the tokens matches.</returns>
        private static bool CheckToken(string[] tokens, char[] recent)
        {
            if (tokens == null || recent == null || recent.Length == 0)
            {
                return false;
            }

            int last = recent.Length - 1;
            if (!IsTokenDelimiter(recent[last]))
            {
                return false;
            }

            foreach (string token in tokens)
            {
                if (string.IsNullOrEmpty(token))
                {
                    continue;
                }

                // The token's length decides where it starts. The original code
                // always read token[1], which throws for the one-character
                // operators ' and ".
                int start = last - token.Length;
                if (start < 1 || !IsTokenDelimiter(recent[start - 1]))
                {
                    continue;
                }

                bool matches = true;
                for (int k = 0; k < token.Length; k++)
                {
                    if (recent[start + k] != token[k])
                    {
                        matches = false;
                        break;
                    }
                }

                if (matches)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Tells whether a character separates operators: space, CR or LF.
        /// </summary>
        private static bool IsTokenDelimiter(char c)
        {
            return c == ' ' || c == 0x0d || c == 0x0a;
        }
        #endregion
    }
}

answered Sep 21 '22 21:09

ceetheman

Related questions
                            
                                What is the major use of MarshalByRefObject?
                            
                                Capture console exit C#
                            
                                Loader lock error
                            
                                ASP.NET MVC Razor: How to render a Razor Partial View's HTML inside the controller action
                            
                                SignalR Console app example
                            
                                Where is the "Fold" LINQ Extension Method?
                            
                                How to send email in ASP.NET C#
                            
                                ThreadStatic v.s. ThreadLocal<T>: is generic better than attribute?
                            
                                Where is NuGet.Config file located in Visual Studio project?
                            
                                Unable to apply publish properties for item X
                            
                                Lazy Loading vs Eager Loading
                            
                                .NET Out Of Memory Exception - Used 1.3GB but have 16GB installed
                            
                                How to add browse file button to Windows Form using C#
                            
                                How can I safely convert a byte array into a string and back? [duplicate]
                            
                                obtain generic enumerator from an array
                            
                                How to store int[] array in application Settings
                            
                                Monitor vs lock
                            
                                How do I check "no exception occurred" in my MSTest unit test?
                            
                                Can't change target platform to "any CPU"
                            
                                Add custom header in HttpWebRequest

Donate For Us

If you love us, you can donate via PayPal or buy me a coffee so we can maintain and grow the site. Thank you!

Donate Us With

Reading PDF documents in .Net [closed]

Tags:

c#

.net

pdf

JRoppert

People also ask

2 Answers

Brock Nusser

ceetheman

Recent Activity

Donate For Us