PDFsharp & MigraDoc Foundation :: View topic

/**
 * Pulls an 11-character slice out of the text found in a fixed region of a page.
 *
 * The region text has every "c" replaced by "%" and is then URL-decoded as
 * UTF-8; characters 18 (inclusive) to 29 (exclusive) of the decoded text are
 * returned.
 *
 * @param page the page to read the region from
 * @return the extracted slice, or "EMPTY" when taking the slice fails
 *         (for example because the decoded text is shorter than 29 characters)
 * @throws IOException declared for the region extraction and decoding calls
 */
public static String extractSSN(final PDPage page) throws IOException
{
   final String regionName = "ssn";

   // Configure a position-sorted stripper limited to the one search rectangle.
   final PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
   areaStripper.setSortByPosition(true);
   areaStripper.addRegion(regionName, new Rectangle(0, 130, 100, 10));

   // Read the region, then undo the "c"-for-"%" substitution and decode.
   areaStripper.extractRegions(page);
   final String rawText = areaStripper.getTextForRegion(regionName).replaceAll("c", "%");
   final String decoded = URLDecoder.decode(rawText, "UTF-8");

   // Any failure while slicing falls back to the "EMPTY" marker.
   try
   {
      return decoded.substring(18, 29);
   }
   catch (final Exception ex)
   {
      return "EMPTY";
   }
}

using System;
using System.Diagnostics;
using System.IO;
using PdfSharp.Drawing;
using PdfSharp.Pdf;
using PdfSharp.Pdf.IO;

namespace SearchAndReplace
{
    /// <summary>
    /// Console program that opens a PDF file and visits each of its pages.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Opens "HelloWorld.pdf" and steps through its pages by index.
        /// Nothing is read from the pages yet; the loop body is a placeholder.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Load the existing document from disk.
            PdfDocument pdf = PdfReader.Open("HelloWorld.pdf");

            int totalPages = pdf.Pages.Count;
            PdfPage currentPage = null;

            int pageIndex = 0;
            while (pageIndex < totalPages)
            {
                currentPage = pdf.Pages[pageIndex];
                // What action do I need to perform on a page to get the text?
                pageIndex++;
            }
        }
    }
}

/// <summary>
/// Extracts the text of every content stream of every page in "input.pdf"
/// and writes it to "output.txt", one line per content stream. An existing
/// "output.txt" is overwritten.
/// </summary>
/// <remarks>
/// Extraction is best-effort: a failure while reading or extracting stops the
/// run but is reported through <see cref="System.Diagnostics.Trace"/> instead
/// of being silently discarded, and whatever was written so far is kept.
/// The stream bytes are passed to the extractor unchanged, and
/// ExtractTextFromPDFBytes documents its input as uncompressed.
/// </remarks>
public PDFParser()
{
    // 'using' flushes and closes the writer on every exit path.
    using (var streamWriter = new StreamWriter("output.txt", false))
    {
        try
        {
            PdfDocument inputDocument = PdfReader.Open("input.pdf", PdfDocumentOpenMode.ReadOnly);
            var extractor = new PDFTextExtractor();

            foreach (PdfPage page in inputDocument.Pages)
            {
                for (int index = 0; index < page.Contents.Elements.Count; index++)
                {
                    PdfDictionary content = page.Contents.Elements.GetDictionary(index);

                    // An element without a stream has no text; skip it rather
                    // than letting a NullReferenceException abort all remaining pages.
                    if (content == null || content.Stream == null)
                    {
                        continue;
                    }

                    string outputText = extractor.ExtractTextFromPDFBytes(content.Stream.Value);
                    streamWriter.WriteLine(outputText);
                }
            }
        }
        catch (Exception e)
        {
            // Keep the original "do not crash the caller" behaviour, but leave a trace.
            System.Diagnostics.Trace.TraceError("PDFParser: text extraction from input.pdf failed: " + e);
        }
    }
}

/// <summary>
/// Heuristic text extractor for the bytes of an uncompressed PDF content stream.
/// </summary>
/// <remarks>
/// The scanner looks for these operators, each delimited by a space, CR or LF
/// on both sides:
///   BT        beginning of a text object
///   ET        end of a text object
///   Td, TD    move to the start of the next line (emits "\n\r")
///   T*, ', "  next-line operators (emit "\n")
///   Tj        show text (emits a single space after the text)
/// Text is taken from literal strings written in parentheses. Other operators
/// (for example Ts, as in "5 Ts" superscript / "-5 Ts" subscript) are not interpreted.
/// </remarks>
public class PDFTextExtractor
{
    #region Fields

    /// <summary>
    /// Size of the sliding window of most recently read characters that is
    /// inspected when looking for operators.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Operator groups, allocated once instead of once per input byte.
    private static readonly string[] BeginTextTokens = { "BT" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] LineMoveTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };

    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Scans an uncompressed content stream and returns the text found in
    /// literal strings inside BT ... ET text objects.
    /// </summary>
    /// <param name="input">The uncompressed content-stream bytes; may be null or empty.</param>
    /// <returns>
    /// The extracted text, or an empty string when <paramref name="input"/> is
    /// null or empty or contains no text.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        // StringBuilder avoids the quadratic cost of repeated string concatenation.
        var result = new System.Text.StringBuilder();

        // True while the scanner is between BT and ET.
        bool inTextObject = false;

        // True when the previous character was an unescaped backslash, so the
        // current character is taken literally (e.g. "\(" yields "(").
        bool nextLiteral = false;

        // 1 while inside a "( ... )" literal string, otherwise 0.
        int bracketDepth = 0;

        // Sliding window of the most recent characters, pre-filled with spaces
        // so that an operator at the very start of the stream counts as delimited.
        char[] previousCharacters = new char[NumberOfCharsToKeep];
        for (int j = 0; j < NumberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Operators are only recognised outside literal strings. The
                // window does not contain 'c' yet, so a match means the operator
                // and its trailing delimiter were the characters just before 'c'.
                if (bracketDepth == 0)
                {
                    if (CheckToken(LineMoveTokens, previousCharacters))
                    {
                        // Kept byte-for-byte as "\n\r" (not "\r\n") so existing
                        // output does not change.
                        result.Append("\n\r");
                    }
                    else if (CheckToken(NextLineTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(ShowTextTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                {
                    // End of the text object.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start of a literal string.
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // End of a literal string.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // Escape introducer: do not output it, take the next character as-is.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Output printable ASCII and the byte values 128..254 only.
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            result.Append(c);
                        }

                        nextLiteral = false;
                    }
                }
            }

            // Slide the window left by one and append the current character.
            System.Array.Copy(previousCharacters, 1, previousCharacters, 0, NumberOfCharsToKeep - 1);
            previousCharacters[NumberOfCharsToKeep - 1] = c;

            // Start of a text object (checked with 'c' already in the window).
            if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return result.ToString();
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Checks whether the window of recent characters ends with one of the
    /// given tokens, delimited on both sides by a space, CR or LF.
    /// </summary>
    /// <param name="tokens">The operator tokens to look for.</param>
    /// <param name="recent">
    /// The window of recent characters; must hold <see cref="NumberOfCharsToKeep"/> entries.
    /// </param>
    /// <returns>True when any token matches; otherwise false.</returns>
    /// <remarks>
    /// Tokens of any length that fits the window are compared in full. In
    /// particular a one-character token no longer aborts the search, so the
    /// tokens listed after it are still tested.
    /// </remarks>
    private static bool CheckToken(string[] tokens, char[] recent)
    {
        int last = NumberOfCharsToKeep - 1;

        // Every match needs a delimiter as the most recent character.
        if (!IsDelimiter(recent[last]))
        {
            return false;
        }

        foreach (string token in tokens)
        {
            int length = token.Length;

            // The window must hold the token plus one delimiter on each side.
            if (length == 0 || length > NumberOfCharsToKeep - 2)
            {
                continue;
            }

            int start = last - length;
            if (!IsDelimiter(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns true for the characters accepted as operator delimiters:
    /// space, carriage return (0x0d) and line feed (0x0a).
    /// </summary>
    private static bool IsDelimiter(char value)
    {
        return value == ' ' || value == 0x0d || value == 0x0a;
    }
    #endregion
}

/// <summary>
/// Removes from <paramref name="document"/> every page whose content streams
/// do not contain <paramref name="referenceString"/>.
/// </summary>
/// <param name="document">The document to prune; modified in place.</param>
/// <param name="referenceString">The text a page must contain to be kept.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="document"/> or <paramref name="referenceString"/> is null.
/// </exception>
/// <remarks>
/// The search runs over the raw stream bytes, mapped one byte to one char and
/// concatenated across all content elements of the page, so it only finds
/// text that appears literally in those bytes.
/// </remarks>
private void RemoveUnReferencedPages(PdfDocument document, string referenceString)
{
    if (document == null)
    {
        throw new System.ArgumentNullException("document");
    }
    if (referenceString == null)
    {
        throw new System.ArgumentNullException("referenceString");
    }

    int pageCount = document.Pages.Count;
    bool[] keepPageArray = new bool[pageCount];

    // First pass: flag the pages that contain the reference value.
    for (int pageNo = 0; pageNo < pageCount; pageNo++)
    {
        PdfPage page = document.Pages[pageNo];

        // StringBuilder avoids the quadratic cost of appending one char at a
        // time to a string.
        var pageText = new System.Text.StringBuilder();

        for (int i = 0; i < page.Contents.Elements.Count; i++)
        {
            PdfDictionary element = page.Contents.Elements.GetDictionary(i);

            // Elements without stream data contribute no text.
            if (element == null || element.Stream == null)
            {
                continue;
            }

            byte[] streamValue = element.Stream.Value;
            if (streamValue == null)
            {
                continue;
            }

            pageText.EnsureCapacity(pageText.Length + streamValue.Length);
            foreach (byte b in streamValue)
            {
                pageText.Append((char)b);
            }
        }

        keepPageArray[pageNo] = pageText.ToString().Contains(referenceString);
    }

    // Second pass: remove the unflagged pages in reverse order. Removing an
    // earlier page shifts every later page down by one index, so going
    // backwards keeps the remaining indexes valid.
    for (int i = pageCount - 1; i >= 0; i--)
    {
        if (!keepPageArray[i])
        {
            PdfPage deletePage = document.Pages[i];
            document.Pages.Remove(deletePage);
        }
    }
}

Author:	ldd [ Wed Oct 20, 2010 3:43 pm ]
Post subject:	Search text in a PDF
Hi, Is it possible to search some text in a PDF? Thanks

Author:	ldd [ Thu Oct 28, 2010 8:01 am ]
Post subject:	Re: Search text in a PDF
There are no answers so I assume that it is not possible.

Author:	Thomas Hoevel [ Thu Oct 28, 2010 9:01 am ]
Post subject:	Re: Search text in a PDF
ldd wrote: Is it possible to search some text in a PDF? Yes, it's possible. But you must extract the text yourself, there is no Search function in PDFsharp. Improving the functions for text extraction from PDF files is on our wish list.

Author:	ldd [ Thu Oct 28, 2010 1:12 pm ]
Post subject:	Re: Search text in a PDF
Thank you

Author:	aaron.walker [ Tue Nov 02, 2010 2:51 pm ]
Post subject:	Re: Search text in a PDF
Right now we use PDFBox to extract text. They have something called PDFTextStripperByArea where you can provide a searchArea and it will get the text from that area provided. Does PDFSharp have something like that where I can get the text myself? Here is a part of the PDFBox code: Code: public static String extractSSN(final PDPage page) throws IOException { // Stripper object. final PDFTextStripperByArea stripper = new PDFTextStripperByArea(); stripper.setSortByPosition(true); // Set the area to search on the PDF. final Rectangle searchArea = new Rectangle(0, 130, 100, 10); stripper.addRegion("ssn", searchArea); // Extract the text from the area, then pluck the ssn from it. stripper.extractRegions(page); String text = stripper.getTextForRegion("ssn").replaceAll("c", "%"); text = URLDecoder.decode(text, "UTF-8"); // Return the portion of the string we need. String output = ""; try { output = text.substring(18, 29); } catch (final Exception ex) { output = "EMPTY"; } return output; }

PDFsharp & MigraDoc Foundation https://forum.pdfsharp.net/

Search text in a PDF https://forum.pdfsharp.net/viewtopic.php?f=2&t=1382	Page 1 of 1

Author:	craiggers [ Fri Nov 26, 2010 5:52 am ]
Post subject:	Re: Search text in a PDF
Thomas Hoevel wrote: Yes, it's possible. But you must extract the text yourself, there is no Search function in PDFsharp. Hi Thomas. Could you please be so kind as to give us a few pointers on how to search for text in PDF documents? I would be most grateful for some direction on how to go about reading the text in PDF files using the PDFsharp library. How would I extract the text myself? Many thanks in anticipation. Regards Craig Gers Code: using System; using System.Diagnostics; using System.IO; using PdfSharp.Drawing; using PdfSharp.Pdf; using PdfSharp.Pdf.IO; namespace SearchAndReplace { class Program { static void Main(string[] args) { // Open an existing document for editing and draw on first page PdfDocument document = PdfReader.Open("HelloWorld.pdf"); int PageCount = document.Pages.Count; PdfPage page = null; for (int j = 0; j < PageCount; j++) { page = document.Pages[j]; //What action do I need to perform on a page to get the text? } } } }

Author:	filip [ Mon Nov 29, 2010 6:25 am ]
Post subject:	Re: Search text in a PDF
Hi Craig, I have just started using PdfSharp so I'm learning as well. I wrote this method to extract text from a pdf, maybe it will help you. It uses a class called PDFTextExtractor which I found online, I have attached the class to this post. Code: public PDFParser() { var streamWriter = new StreamWriter("output.txt", false); String outputText = ""; try { PdfDocument inputDocument = PdfReader.Open("input.pdf", PdfDocumentOpenMode.ReadOnly); foreach (PdfPage page in inputDocument.Pages) { for (int index = 0; index < page.Contents.Elements.Count; index++) { PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream; outputText = new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value); streamWriter.WriteLine(outputText); } } } catch (Exception e) { } streamWriter.Close(); } This worked on the pdfs I had to test with. If you need to search the text you can look into Regular Expression or use the methods of the string class in .net. Good luck, Filip

Author:	filip [ Mon Nov 29, 2010 6:27 am ]
Post subject:	Re: Search text in a PDF
OK, I dont think my attachment worked. Here is the class PDFTextExtractor: Code: public class PDFTextExtractor { /// BT = Beginning of a text object operator /// ET = End of a text object operator /// Td move to the start of next line /// 5 Ts = superscript /// -5 Ts = subscript #region Fields #region _numberOfCharsToKeep /// <summary> /// The number of characters to keep, when extracting text. /// </summary> private static int _numberOfCharsToKeep = 15; #endregion #endregion #region ExtractTextFromPDFBytes /// <summary> /// This method processes an uncompressed Adobe (text) object /// and extracts text. /// </summary> /// <param name="input">uncompressed</param> /// <returns></returns> public string ExtractTextFromPDFBytes(byte[] input) { if (input == null \|\| input.Length == 0) return ""; try { string resultString = ""; // Flag showing if we are we currently inside a text object bool inTextObject = false; // Flag showing if the next character is literal // e.g. '\\' to get a '\' character or '\(' to get '(' bool nextLiteral = false; // () Bracket nesting level. Text appears inside () int bracketDepth = 0; // Keep previous chars to get extract numbers etc.: char[] previousCharacters = new char[_numberOfCharsToKeep]; for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' '; for (int i = 0; i < input.Length; i++) { char c = (char)input[i]; if (inTextObject) { // Position the text if (bracketDepth == 0) { if (CheckToken(new string[] { "TD", "Td" }, previousCharacters)) { resultString += "\n\r"; } else { if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters)) { resultString += "\n"; } else { if (CheckToken(new string[] { "Tj" }, previousCharacters)) { resultString += " "; } } } } // End of a text object, also go to a new line. 
if (bracketDepth == 0 && CheckToken(new string[] { "ET" }, previousCharacters)) { inTextObject = false; resultString += " "; } else { // Start outputting text if ((c == '(') && (bracketDepth == 0) && (!nextLiteral)) { bracketDepth = 1; } else { // Stop outputting text if ((c == ')') && (bracketDepth == 1) && (!nextLiteral)) { bracketDepth = 0; } else { // Just a normal text character: if (bracketDepth == 1) { // Only print out next character no matter what. // Do not interpret. if (c == '\\' && !nextLiteral) { nextLiteral = true; } else { if (((c >= ' ') && (c <= '~')) \|\| ((c >= 128) && (c < 255))) { resultString += c.ToString(); } nextLiteral = false; } } } } } } // Store the recent characters for // when we have to go back for a checking for (int j = 0; j < _numberOfCharsToKeep - 1; j++) { previousCharacters[j] = previousCharacters[j + 1]; } previousCharacters[_numberOfCharsToKeep - 1] = c; // Start of a text object if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters)) { inTextObject = true; } } return resultString; } catch { return ""; } } #endregion #region CheckToken /// <summary> /// Check if a certain 2 character token just came along (e.g. BT) /// </summary> /// <param name="search">the searched token</param> /// <param name="recent">the recent character array</param> /// <returns></returns> private bool CheckToken(string[] tokens, char[] recent) { foreach (string token in tokens) { if (token.Length > 1) { if ((recent[_numberOfCharsToKeep - 3] == token[0]) && (recent[_numberOfCharsToKeep - 2] == token[1]) && ((recent[_numberOfCharsToKeep - 1] == ' ') \|\| (recent[_numberOfCharsToKeep - 1] == 0x0d) \|\| (recent[_numberOfCharsToKeep - 1] == 0x0a)) && ((recent[_numberOfCharsToKeep - 4] == ' ') \|\| (recent[_numberOfCharsToKeep - 4] == 0x0d) \|\| (recent[_numberOfCharsToKeep - 4] == 0x0a)) ) { return true; } } else { return false; } } return false; } #endregion }

Author:	davidedm [ Fri Jan 28, 2011 4:24 pm ]
Post subject:	Re: Search text in a PDF
I created a PDF file with OpenOffice and then tried your class, but it returns only newlines or carriage returns. Do you know why? Thanks in advance.

Author:	jjcaleiscool [ Fri Feb 18, 2011 6:19 pm ]
Post subject:	Re: Search text in a PDF
I've had the same problem here - I created a PDF in OpenOffice with some sample text, but only get empty strings back. Were you able to make any progress with this? Thanks!

Author:	gregwilkerson [ Sun Mar 20, 2011 4:58 pm ]
Post subject:	Re: Search text in a PDF
Here's what I did. With this solution, all I was interested in was removing pages that did not have a user specified string in them. I did not care where the string was placed, simply if it existed. Code: private void RemoveUnReferencedPages(PdfDocument document, string referenceString) { // this procedure removes any pages from the pdf document that do not contain // the reference string int pageNo = -1; string strStreamValue; byte[] streamValue; bool[] keepPageArray; keepPageArray = new bool[document.Pages.Count]; // iterate through the pages foreach (PdfPage page in document.Pages) { pageNo++; strStreamValue = ""; // put the stream value for every element on the page in a string variable. for (int i = 0; i < page.Contents.Elements.Count; i++) { PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(i).Stream; streamValue = stream.Value; foreach (byte b in streamValue) { strStreamValue += (char)b; } } // flag those pages that contain the reference value keepPageArray[pageNo] = strStreamValue.Contains(referenceString); } // Now, remove the pages we identified. We're doing this in reverse order // because the deletion of an earlier page moves the rest of the pages up // on page. This keeps us from deleting the wrong pages. for (int i = keepPageArray.Length - 1; i > -1; i--) { if (!keepPageArray[i]) { PdfPage deletePage = document.Pages[i]; document.Pages.Remove(deletePage); } } }

Page 1 of 1	All times are UTC
Powered by phpBB® Forum Software © phpBB Group https://www.phpbb.com/