PDFsharp & MigraDoc Foundation :: View topic - How to read text from a pdf with 2 streams

/// <summary>
/// Heuristic text extractor for an already-decoded PDF content stream.
/// It scans the stream byte by byte, keeps a small window of the most recent
/// characters and reacts to a handful of text operators:
/// <list type="bullet">
/// <item><description>BT = beginning of a text object</description></item>
/// <item><description>ET = end of a text object</description></item>
/// <item><description>Td / TD = move to the start of the next line</description></item>
/// <item><description>T*, ' and " = line-break operators</description></item>
/// <item><description>Tj = show text</description></item>
/// </list>
/// Text is taken from literal strings, i.e. the characters between '(' and ')'.
/// </summary>
public class PDFTextExtractor
{
    #region Fields

    /// <summary>
    /// The number of characters to keep, when extracting text
    /// (size of the look-behind window used for operator detection).
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    /// <summary>
    /// Inputs of this length or shorter are rejected and yield an empty string.
    /// NOTE(review): the threshold is inherited unchanged; presumably it filters
    /// out streams too small to carry text - confirm before changing it.
    /// </summary>
    private const int MinimumInputLength = 100;

    // Operator groups, hoisted so that no array is allocated per input byte.
    private static readonly string[] MoveToNextLineTokens = { "TD", "Td" };
    private static readonly string[] LineBreakTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] BeginTextTokens = { "BT" };

    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Processes an uncompressed content stream and extracts the text found in
    /// literal strings inside BT ... ET text objects.
    /// </summary>
    /// <param name="input">
    /// The uncompressed stream bytes. Each byte is converted to one character
    /// by a plain cast.
    /// </param>
    /// <returns>
    /// The extracted text. An empty string is returned when
    /// <paramref name="input"/> is null or not longer than 100 bytes.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length <= MinimumInputLength)
        {
            return "";
        }

        // No catch-all here: every array access below is bounded by
        // NumberOfCharsToKeep, so there is no expected failure to swallow.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        // True while we are between BT and ET.
        bool inTextObject = false;

        // True when the previous character inside a string was an unescaped
        // backslash, e.g. '\\' to get a '\' character or '\(' to get '('.
        bool nextLiteral = false;

        // 1 while inside a (...) literal string, 0 otherwise.
        int bracketDepth = 0;

        // Look-behind window; it never contains the character currently
        // being processed, only the ones before it.
        char[] previousCharacters = new char[NumberOfCharsToKeep];
        for (int j = 0; j < NumberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        foreach (byte current in input)
        {
            char c = (char)current;

            if (inTextObject)
            {
                // Positioning operators are only honoured outside strings.
                if (bracketDepth == 0)
                {
                    if (CheckToken(MoveToNextLineTokens, previousCharacters))
                    {
                        // Separator kept exactly as callers have always received it.
                        result.Append("\n\r");
                    }
                    else if (CheckToken(LineBreakTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(ShowTextTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                {
                    // End of the text object: leave it and separate from what follows.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start outputting text.
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // Stop outputting text.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    if (c == '\\' && !nextLiteral)
                    {
                        // The backslash itself is not emitted; the next
                        // character is taken literally and not interpreted.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Emit printable ASCII and the 128..254 range only.
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            result.Append(c);
                        }

                        nextLiteral = false;
                    }
                }
            }

            // Slide the window one position to the left and store the
            // current character at its end.
            System.Array.Copy(previousCharacters, 1, previousCharacters, 0, NumberOfCharsToKeep - 1);
            previousCharacters[NumberOfCharsToKeep - 1] = c;

            // Start of a text object (checked after the window was updated).
            if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return result.ToString();
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Checks whether one of the given operator tokens just came along, i.e.
    /// whether the window ends with "separator, token, separator".
    /// Tokens of any length are supported. Previously a one-character token
    /// made the whole check return false, which hid every token listed after
    /// it (for example T* listed after ').
    /// </summary>
    /// <param name="tokens">the searched tokens</param>
    /// <param name="recent">the recent character array</param>
    /// <returns>true when any of the tokens matches, otherwise false</returns>
    private static bool CheckToken(string[] tokens, char[] recent)
    {
        int last = NumberOfCharsToKeep - 1;

        // Every match needs a separator right after the token.
        if (!IsTokenSeparator(recent[last]))
        {
            return false;
        }

        foreach (string token in tokens)
        {
            int start = last - token.Length;

            // Skip empty tokens and tokens that do not fit into the window
            // together with their leading separator.
            if (token.Length == 0 || start < 1)
            {
                continue;
            }

            if (!IsTokenSeparator(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns true for the characters treated as token separators:
    /// space, carriage return (0x0d) and line feed (0x0a).
    /// </summary>
    private static bool IsTokenSeparator(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }
    #endregion
}

Author:	dolo33 [ Thu Aug 25, 2016 12:21 pm ]
Post subject:	How to read text from a pdf with 2 streams
Hi everyone, I need to read and search the text from a PDF in C#. It works fine with a PDF with a single stream in it but fails with 2 streams in a pdf (see below for my PDF). I can read only the first stream and the info i need is in the second one... I tried 2 methods i found on this forum to get the text (ContentReader.ReadContent and page.Contents.Elements.GetDictionary(index).Stream) but both only read the first stream. Is it possible to do that with PDFSharp or should I use another library (and which one, PDFBox ?) ? Thanks in advance %PDF-1.4 %àáâã 3 0 obj <</Length 29 /Filter /FlateDecode >> stream xœ+T0T0 B™œ« ‘f¨à’¯È NÐ endstream endobj 4 0 obj <</Parent 1 0 R /Contents 3 0 R /Type /Page /Resources <</ProcSet [/PDF /ImageC] /XObject <</Xf1 2 0 R >> >> /MediaBox [0 0 595 842] >> endobj 2 0 obj <</Type /XObject /Resources <</ProcSet [/PDF /ImageC /Text] /Font <</F9 5 0 R /F8 6 0 R /F7 7 0 R /F1 8 0 R /F2 9 0 R /F10 10 0 R /F3 11 0 R /F11 12 0 R /F4 13 0 R /F5 14 0 R /F6 15 0 R /F14 16 0 R /F12 17 0 R /F13 18 0 R >> >> /Subtype /Form /BBox [0 0 596 843] /Matrix [1 0 0 1 0 0] /Length 8346 /FormType 1 /Filter [/ASCII85Decode /FlateDecode] >> stream Gau0I ...

Author:	Thomas Hoevel [ Thu Aug 25, 2016 12:27 pm ]
Post subject:	Re: How to read text from a pdf with 2 streams
Hi! Are you referring to "/Filter [/ASCII85Decode /FlateDecode]"? This is one stream, but it has two filters - and both filters must be applied one after the other to decode the contents. You don't show any code. Maybe support for two filters must be added somewhere - in PDFsharp or in your code.

Author:	dolo33 [ Thu Aug 25, 2016 1:34 pm ]
Post subject:	Re: How to read text from a pdf with 2 streams
Thomas Hoevel wrote: Hi! Are you referring to "/Filter [/ASCII85Decode /FlateDecode]"? This is one stream, but it has two filters - and both filters must be applied one after the other to decode the contents. You don't show any code. Maybe support for two filters must be added somewhere - in PDFsharp or in your code. Here is the code : Code: string ParsePdfAsText(PdfDocument pdf) { string outputText = ""; foreach (PdfPage page in pdf.Pages) { for (int index = 0; index < page.Contents.Elements.Count; index++) { PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream; outputText += new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value) + Environment.NewLine; } } return outputText; } Code for ExtractTextFromPDFBytes is below. In the stream variable i get the first stream correctly decoded First stream : stream xœ+T0T0 B™œ« ‘f¨à’¯È NÐ endstream and in the stream variable : {q 1 0 0 1 0 0 cm /Xf1 Do Q} I don't think it's a filter problem because I don't even know how to access the second stream ??? I think page.Contents.Elements.Count should be 2 because I have two objects with a stream in my PDF but it's only 1 ? I didn't write this code and i'm trying to figure out how it works... Code: public class PDFTextExtractor { /// BT = Beginning of a text object operator /// ET = End of a text object operator /// Td move to the start of next line /// 5 Ts = superscript /// -5 Ts = subscript #region Fields #region _numberOfCharsToKeep /// <summary> /// The number of characters to keep, when extracting text. /// </summary> private static int _numberOfCharsToKeep = 15; #endregion #endregion #region ExtractTextFromPDFBytes /// <summary> /// This method processes an uncompressed Adobe (text) object /// and extracts text. 
/// </summary> /// <param name="input">uncompressed</param> /// <returns></returns> public string ExtractTextFromPDFBytes(byte[] input) { if (input == null \|\| input.Length <= 100) return ""; try { string resultString = ""; // Flag showing if we are we currently inside a text object bool inTextObject = false; // Flag showing if the next character is literal // e.g. '\\' to get a '\' character or '\(' to get '(' bool nextLiteral = false; // () Bracket nesting level. Text appears inside () int bracketDepth = 0; // Keep previous chars to get extract numbers etc.: char[] previousCharacters = new char[_numberOfCharsToKeep]; for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' '; for (int i = 0; i < input.Length; i++) { char c = (char)input[i]; if (inTextObject) { // Position the text if (bracketDepth == 0) { if (CheckToken(new string[] { "TD", "Td" }, previousCharacters)) { resultString += "\n\r"; } else { if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters)) { resultString += "\n"; } else { if (CheckToken(new string[] { "Tj" }, previousCharacters)) { resultString += " "; } } } } // End of a text object, also go to a new line. if (bracketDepth == 0 && CheckToken(new string[] { "ET" }, previousCharacters)) { inTextObject = false; resultString += " "; } else { // Start outputting text if ((c == '(') && (bracketDepth == 0) && (!nextLiteral)) { bracketDepth = 1; } else { // Stop outputting text if ((c == ')') && (bracketDepth == 1) && (!nextLiteral)) { bracketDepth = 0; } else { // Just a normal text character: if (bracketDepth == 1) { // Only print out next character no matter what. // Do not interpret. 
if (c == '\\' && !nextLiteral) { nextLiteral = true; } else { if (((c >= ' ') && (c <= '~')) \|\| ((c >= 128) && (c < 255))) { resultString += c.ToString(); } nextLiteral = false; } } } } } } // Store the recent characters for // when we have to go back for a checking for (int j = 0; j < _numberOfCharsToKeep - 1; j++) { previousCharacters[j] = previousCharacters[j + 1]; } previousCharacters[_numberOfCharsToKeep - 1] = c; // Start of a text object if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters)) { inTextObject = true; } } return resultString; } catch { return ""; } } #endregion #region CheckToken /// <summary> /// Check if a certain 2 character token just came along (e.g. BT) /// </summary> /// <param name="search">the searched token</param> /// <param name="recent">the recent character array</param> /// <returns></returns> private bool CheckToken(string[] tokens, char[] recent) { foreach (string token in tokens) { if (token.Length > 1) { if ((recent[_numberOfCharsToKeep - 3] == token[0]) && (recent[_numberOfCharsToKeep - 2] == token[1]) && ((recent[_numberOfCharsToKeep - 1] == ' ') \|\| (recent[_numberOfCharsToKeep - 1] == 0x0d) \|\| (recent[_numberOfCharsToKeep - 1] == 0x0a)) && ((recent[_numberOfCharsToKeep - 4] == ' ') \|\| (recent[_numberOfCharsToKeep - 4] == 0x0d) \|\| (recent[_numberOfCharsToKeep - 4] == 0x0a)) ) { return true; } } else { return false; } } return false; } #endregion }

Author:	() => true [ Fri Aug 26, 2016 1:02 pm ]
Post subject:	Re: How to read text from a pdf with 2 streams
dolo33 wrote: I don't think it's a filter problem because I don't even know how to access the second stream? To me it is unclear what the problem is. Now I see a lot of code, but still no PDF. I assume that something goes wrong when the line Code: outputText += new PDFTextExtractor().ExtractTextFromPDFBytes(stream.Value) + Environment.NewLine; is invoked for the second time, but I'm not sure what goes wrong. A solution based on our Issue Submission Template would be great. See also: viewtopic.php?f=2&t=832

Author:	Gerben Vos [ Mon Aug 29, 2016 10:12 am ]
Post subject:	Re: How to read text from a pdf with 2 streams
The second stream in the PDF fragment you posted is an XObject. One way to get at the XObject is: Code: PdfPage page = ...; PdfDictionary xObjects = page.Resources.Elements.GetDictionary("/XObject"); PdfDictionary xobj = xObjects.Elements.GetDictionary("/Xf1"); PdfStream stream = xobj.Stream; Something like stream.UnfilteredValue should decode this just fine. You can find the name of the XObject as an operand of the Do operator in the content stream, as you wrote yourself: Quote: q 1 0 0 1 0 0 cm /Xf1 Do Q For more details, please read up on XObjects in the pdf spec.

PDFsharp & MigraDoc Foundation https://forum.pdfsharp.net/

How to read text from a pdf with 2 streams https://forum.pdfsharp.net/viewtopic.php?f=2&t=3435	Page 1 of 1

Author:	dolo33 [ Tue Aug 30, 2016 3:56 pm ]
Post subject:	Re: How to read text from a pdf with 2 streams
Gerben Vos wrote: The second stream in the PDF fragment you posted is an XObject. One way to get at the XObject is: Code: PdfPage page = ...; PdfDictionary xObjects = page.Resources.Elements.GetDictionary("/XObject"); PdfDictionary xobj = xObjects.Elements.GetDictionary("/Xf1"); PdfStream stream = xobj.Stream; Something like stream.UnfilteredValue should decode this just fine. You can find the name of the XObject as an operand of the Do operator in the content stream, as you wrote yourself: Quote: q 1 0 0 1 0 0 cm /Xf1 Do Q For more details, please read up on XObjects in the pdf spec. Ok as you said, it works fine with your method and UnfilteredValue to decode text except that : PdfDictionary xObjects = page.Resources.Elements.GetDictionary("/XObject"); didn't work so i used : PdfDictionary resources = page.Elements.GetDictionary("/Resources"); PdfDictionary xObjects = resources.Elements.GetDictionary("/XObject"); Anyway, many many thanks !

Author:	Gerben Vos [ Thu Sep 08, 2016 7:56 pm ]
Post subject:	Re: How to read text from a pdf with 2 streams
Hmm, worked for me. Do you use version 1.50beta3b, or an earlier one?

Page 1 of 1	All times are UTC
Powered by phpBB® Forum Software © phpBB Group https://www.phpbb.com/