Lire des documents PDF dans .Net

Existe-t-il une bibliothèque open source qui m’aide à lire / analyser (parser) les documents PDF en .NET / C# ?

iTextSharp est le meilleur choix. Je l’ai utilisé pour écrire un spider pour Lucene.Net afin qu’il puisse explorer les fichiers PDF.

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    /// <remarks>
    /// The extractor is a small hand-written scanner over the raw page content stream.
    /// Operators it reacts to:
    ///   BT        beginning of a text object
    ///   ET        end of a text object
    ///   TD / Td   move to the start of the next line
    ///   ' T* "    next-line operators
    ///   Tj        show text
    /// (For reference: "5 Ts" is superscript and "-5 Ts" is subscript; they are not handled.)
    /// </remarks>
    public class PDFParser
    {
        #region Fields

        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;

        // Operator groups, hoisted so they are not re-allocated for every input byte.
        private static readonly string[] LineMoveTokens = { "TD", "Td" };
        private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
        private static readonly string[] ShowTextTokens = { "Tj" };
        private static readonly string[] EndTextTokens = { "ET" };
        private static readonly string[] BeginTextTokens = { "BT" };

        // Escape sequences left in the raw text (regex patterns) and, index for index,
        // the characters they are replaced with.
        private static readonly string[] EscapePatterns = {
            @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224",
            @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304",
            @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313",
            @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326",
            @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317",
            @"\\347", @"\\307",
            @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334",
            @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221" };

        private static readonly string[] EscapeReplacements = {
            "(", ")", "-", "'", "\"", "\"",
            "à", "â", "ä", "À", "Â", "Ä",
            "é", "è", "ê", "ë", "É", "È", "Ê", "Ë",
            "ò", "ô", "ö", "Ò", "Ô", "Ö",
            "ì", "î", "ï", "Ì", "Î", "Ï",
            "ç", "Ç",
            "ù", "û", "ü", "Ù", "Û", "Ü",
            "®", "™", "«", "»", "©", "'" };

        // Compiled once instead of once per pattern per call.
        private static readonly Regex[] EscapeRegexes = BuildEscapeRegexes();

        #endregion

        #region ExtractText

        /// <summary>
        /// Extracts a text from a PDF file and writes it, UTF-8 encoded, to another file.
        /// A 68-character progress bar of '#' is written to the console while pages are processed.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>true if the extraction completed; false if any exception occurred.</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            PdfReader reader = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int totalLen = 68;
                // Number of '#' each page is worth; below 1 when there are more pages than columns.
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        totalWritten += WriteProgress((int)charUnit);
                    }
                    else
                    {
                        // Accumulate fractions until they add up to at least one '#'.
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            totalWritten += WriteProgress((int)curUnit);
                            curUnit = 0;
                        }
                    }
                }

                // Rounding may leave the bar short; pad it to its full width.
                if (totalWritten < totalLen)
                {
                    WriteProgress(totalLen - totalWritten);
                }
                return true;
            }
            catch
            {
                // Best-effort API: failures are reported through the return value.
                return false;
            }
            finally
            {
                try
                {
                    if (outFile != null) outFile.Close();
                }
                finally
                {
                    // The reader was previously never closed.
                    if (reader != null) reader.Close();
                }
            }
        }

        /// <summary>
        /// Writes <paramref name="count"/> '#' characters to the console and returns that count.
        /// </summary>
        private static int WriteProgress(int count)
        {
            Console.Write(new string('#', count));
            return count;
        }

        #endregion

        #region ExtractTextFromPDFBytes

        /// <summary>
        /// This method processes an uncompressed Adobe (text) object
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed content stream bytes</param>
        /// <returns>the extracted text; an empty string for null/empty input or on any error.</returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                System.Text.StringBuilder result = new System.Text.StringBuilder();

                // Flag showing if we are currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal
                // eg '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Sliding window of the most recent characters, used to recognise operators.
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';

                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    // Byte 213 is mapped to an apostrophe.
                    if (input[i] == 213) c = '\'';

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(LineMoveTokens, previousCharacters))
                            {
                                result.Append("\n\r");
                            }
                            else if (CheckToken(NextLineTokens, previousCharacters))
                            {
                                result.Append("\n");
                            }
                            else if (CheckToken(ShowTextTokens, previousCharacters))
                            {
                                result.Append(" ");
                            }
                        }

                        if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                        {
                            // End of a text object: leave text mode and emit a separator.
                            inTextObject = false;
                            result.Append(" ");
                        }
                        else if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                        {
                            // Start outputting text
                            bracketDepth = 1;
                        }
                        else if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                        {
                            // Stop outputting text
                            bracketDepth = 0;
                        }
                        else if (bracketDepth == 1)
                        {
                            // Just a normal text character.
                            if (c == '\\' && !nextLiteral)
                            {
                                // Keep the backslash; CleanupContent resolves the escape later.
                                result.Append(c);
                                nextLiteral = true;
                            }
                            else
                            {
                                if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255)))
                                {
                                    result.Append(c);
                                }
                                nextLiteral = false;
                            }
                        }
                    }

                    // Shift the window left by one and append the current character.
                    Array.Copy(previousCharacters, 1, previousCharacters, 0, _numberOfCharsToKeep - 1);
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(result.ToString());
            }
            catch
            {
                return "";
            }
        }

        /// <summary>
        /// Replaces the escape sequences listed in <see cref="EscapePatterns"/> with the
        /// corresponding entries of <see cref="EscapeReplacements"/>, in order.
        /// </summary>
        private string CleanupContent(string text)
        {
            for (int i = 0; i < EscapeRegexes.Length; i++)
            {
                text = EscapeRegexes[i].Replace(text, EscapeReplacements[i]);
            }
            return text;
        }

        /// <summary>
        /// Builds one regex per entry of <see cref="EscapePatterns"/>.
        /// </summary>
        private static Regex[] BuildEscapeRegexes()
        {
            Regex[] regexes = new Regex[EscapePatterns.Length];
            for (int i = 0; i < EscapePatterns.Length; i++)
            {
                regexes[i] = new Regex(EscapePatterns[i], RegexOptions.IgnoreCase);
            }
            return regexes;
        }

        #endregion

        #region CheckToken

        /// <summary>
        /// Check if a certain operator token just came along (eg BT): the token must sit
        /// immediately before the last buffered character, the last buffered character must
        /// be a space, CR or LF, and so must the character right before the token.
        /// </summary>
        /// <remarks>
        /// Tokens of any length are supported. The previous implementation always read
        /// token[1], which threw for the one-character operators ' and " and made the
        /// caller discard the whole page.
        /// </remarks>
        /// <param name="tokens">the searched tokens</param>
        /// <param name="recent">the recent character array</param>
        /// <returns>true if any of the tokens was just read.</returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            int last = recent.Length - 1;
            if (last < 0 || !IsTokenDelimiter(recent[last])) return false;

            foreach (string token in tokens)
            {
                if (string.IsNullOrEmpty(token)) continue;

                // Index of the token's first character inside the window.
                int start = last - token.Length;
                if (start < 1 || !IsTokenDelimiter(recent[start - 1])) continue;

                bool matches = true;
                for (int k = 0; k < token.Length; k++)
                {
                    if (recent[start + k] != token[k])
                    {
                        matches = false;
                        break;
                    }
                }
                if (matches) return true;
            }
            return false;
        }

        /// <summary>
        /// True for the characters accepted around an operator: space, CR and LF.
        /// </summary>
        private static bool IsTokenDelimiter(char c)
        {
            return c == ' ' || c == 0x0d || c == 0x0a;
        }

        #endregion
    }
}

Depuis la dernière réponse à cette question en 2008, iTextSharp a considérablement amélioré son API. Si vous téléchargez la dernière version de leur API à partir de http://sourceforge.net/projects/itextsharp/ , vous pouvez utiliser l’extrait de code suivant pour extraire tout le texte d’un pdf dans une chaîne.

 using iTextSharp.text.pdf;
 using iTextSharp.text.pdf.parser;

 namespace PdfParser
 {
     /// <summary>
     /// Helper that extracts all the text of a PDF file into a single string using iTextSharp.
     /// </summary>
     public static class PdfTextExtractor
     {
         /// <summary>
         /// Reads every page of the PDF at <paramref name="path"/> and concatenates the page texts.
         /// </summary>
         /// <param name="path">Path of the PDF file to read.</param>
         /// <returns>The text of all pages, in page order, with no separator added between pages.</returns>
         public static string pdfText(string path)
         {
             PdfReader reader = new PdfReader(path);
             try
             {
                 System.Text.StringBuilder text = new System.Text.StringBuilder();
                 for (int page = 1; page <= reader.NumberOfPages; page++)
                 {
                     // Fully qualified on purpose: inside this namespace the simple name
                     // PdfTextExtractor binds to this class, not to iTextSharp's extractor.
                     text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page));
                 }
                 return text.ToString();
             }
             finally
             {
                 // Close the reader even when extraction of a page throws.
                 reader.Close();
             }
         }
     }
 }

J’ai utilisé ITextSharp dans le passé pour manipuler / diviser et réformer des documents PDF – c’est assez simple et aussi open-source.

 /// <summary>
 /// Extracts the text of every page of a PDF file with iTextSharp's simple extraction strategy.
 /// </summary>
 /// <param name="Filename">Path of the PDF file; must be a <see cref="string"/> (it is cast).</param>
 /// <param name="ReadLibray">Not used by this method; kept so existing callers keep compiling.</param>
 /// <returns>The text of all pages, in page order, with no separator added between pages.</returns>
 public string ReadPdfFile(object Filename, DataTable ReadLibray)
 {
     // One reader for the whole document. The previous code opened one reader that was
     // never closed plus a fresh reader for every single page.
     PdfReader reader = new PdfReader((string)Filename);
     try
     {
         System.Text.StringBuilder text = new System.Text.StringBuilder();
         for (int page = 1; page <= reader.NumberOfPages; page++)
         {
             // A new strategy per page, so text of earlier pages is not carried over.
             ITextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();

             // The extracted string is appended as-is. The former round-trip through
             // Encoding.Default could only lose characters the ANSI code page lacks.
             text.Append(PdfTextExtractor.GetTextFromPage(reader, page, strategy));
         }
         return text.ToString();
     }
     finally
     {
         reader.Close();
     }
 }

PDFClown pourrait vous aider, mais je ne le recommanderais pas pour une application volumineuse ou intensive.

iText est la meilleure bibliothèque que je connaisse. Initialement écrit en Java, il existe également un port .NET.

Voir http://www.ujihara.jp/iTextdotNET/en/

Aspose.Pdf fonctionne assez bien. Encore une fois, vous devez payer pour l’utiliser.

Vous pouvez regarder ceci: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx Ce n’est pas complètement gratuit, mais ça a l’air très bien.

Alex

Il y a aussi LibHaru

http://libharu.org/wiki/Main_Page

http://www.c-sharpcorner.com/UploadFile/psingh/PDFFileGenerator12062005235236PM/PDFFileGenerator.aspx est open source et peut être un bon point de départ pour vous.

Jetez un coup d’œil à la bibliothèque Docotic.Pdf . Il ne vous oblige pas à ouvrir le code source de votre application (comme par exemple iTextSharp avec une licence virale AGPL 3).

Docotic.Pdf peut être utilisé pour lire des fichiers PDF et extraire du texte avec ou sans formatage. Veuillez regarder l’exemple qui montre comment extraire du texte à partir de fichiers PDF .

Disclaimer: Je travaille pour Bit Miracle, fournisseur de la bibliothèque.