PDFsharp & MigraDoc Foundation :: View topic - How to Extract Unicode Text From PDF Files?

/// <summary>
/// Minimal text extractor for uncompressed PDF content streams.
/// It scans the bytes once, tracks whether it is inside a text object (BT ... ET)
/// and inside a literal string "( ... )", and collects the printable characters
/// of those strings.
/// </summary>
public class PDFParser
{
    /// BT = Beginning of a text object operator
    /// ET = End of a text object operator
    /// Td move to the start of next line
    /// 5 Ts = superscript
    /// -5 Ts = subscript

    #region Fields

    /// <summary>
    /// The number of most recent characters kept for operator detection.
    /// Must be at least (longest operator length + 2) so that the operator and
    /// the delimiter on each side of it fit into the window.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Operator groups, allocated once instead of once per input byte.
    private static readonly string[] MoveTextTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] BeginTextTokens = { "BT" };

    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Processes an uncompressed PDF content stream and extracts the text shown
    /// by literal strings inside text objects.
    /// </summary>
    /// <remarks>
    /// Operators are recognised one byte after they have been read, so a separator
    /// is appended after the text that precedes the operator:
    /// TD/Td append "\n\r", ' / T* / " append "\n", Tj and ET append " ".
    /// A backslash inside a string makes the following character be emitted
    /// verbatim; escape sequences such as \n or octal codes are not decoded.
    /// Only characters in the ranges 32..126 and 128..254 are emitted.
    /// </remarks>
    /// <param name="input">The uncompressed content stream bytes.</param>
    /// <returns>
    /// The extracted text, or an empty string when <paramref name="input"/> is
    /// null or empty.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        // StringBuilder avoids the quadratic cost of repeated string concatenation.
        System.Text.StringBuilder result = new System.Text.StringBuilder();

        // True while we are between a BT and an ET operator.
        bool inTextObject = false;

        // True when the previous character was an unescaped backslash inside a string.
        bool nextLiteral = false;

        // Nesting level of "(": 0 = outside any string, 1 = inside a string,
        // greater than 1 = inside balanced parentheses within a string.
        int bracketDepth = 0;

        // Sliding window over the most recently read characters, newest last.
        char[] previousCharacters = new char[NumberOfCharsToKeep];
        for (int j = 0; j < NumberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Positioning / show operators are only meaningful outside strings.
                if (bracketDepth == 0)
                {
                    if (CheckToken(MoveTextTokens, previousCharacters))
                    {
                        // Literal kept exactly as it has always been emitted.
                        result.Append("\n\r");
                    }
                    else if (CheckToken(NextLineTokens, previousCharacters))
                    {
                        result.Append("\n");
                    }
                    else if (CheckToken(ShowTextTokens, previousCharacters))
                    {
                        result.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                {
                    // End of the text object.
                    inTextObject = false;
                    result.Append(" ");
                }
                else if (bracketDepth == 0)
                {
                    // Outside a string only an opening parenthesis matters.
                    if (c == '(')
                    {
                        bracketDepth = 1;
                    }
                }
                else if (nextLiteral)
                {
                    // Escaped character: emit it without interpreting it.
                    AppendIfPrintable(result, c);
                    nextLiteral = false;
                }
                else if (c == '\\')
                {
                    nextLiteral = true;
                }
                else if (c == ')' && bracketDepth == 1)
                {
                    // Closing parenthesis of the string itself.
                    bracketDepth = 0;
                }
                else
                {
                    // Balanced, unescaped parentheses are part of the string's text.
                    if (c == '(')
                    {
                        bracketDepth++;
                    }
                    else if (c == ')')
                    {
                        bracketDepth--;
                    }

                    AppendIfPrintable(result, c);
                }
            }

            // Slide the window left by one and store the current character.
            for (int j = 0; j < NumberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[NumberOfCharsToKeep - 1] = c;

            // Start of a text object (checked after c has entered the window).
            if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return result.ToString();
    }
    #endregion

    #region AppendIfPrintable
    /// <summary>
    /// Appends <paramref name="c"/> to <paramref name="output"/> when it lies in
    /// one of the ranges 32..126 or 128..254; other characters are dropped.
    /// </summary>
    /// <param name="output">The builder collecting the extracted text.</param>
    /// <param name="c">The candidate character.</param>
    private static void AppendIfPrintable(System.Text.StringBuilder output, char c)
    {
        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
        {
            output.Append(c);
        }
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Checks whether one of the given operator tokens has just been read, i.e.
    /// whether the window ends with: delimiter, token, delimiter.
    /// A delimiter is a space, a carriage return or a line feed.
    /// </summary>
    /// <param name="tokens">The operator tokens to look for (any length of 1 or more).</param>
    /// <param name="recent">The recent character window, newest character last.</param>
    /// <returns>True when any token matches; otherwise false.</returns>
    private static bool CheckToken(string[] tokens, char[] recent)
    {
        int last = recent.Length - 1;

        // Every token must be followed by a delimiter, so test that only once.
        if (last < 0 || !IsDelimiter(recent[last]))
        {
            return false;
        }

        foreach (string token in tokens)
        {
            int start = last - token.Length;

            // Skip tokens that are empty or do not fit in the window together
            // with their leading delimiter; keep testing the remaining tokens.
            if (token.Length == 0 || start < 1)
            {
                continue;
            }

            if (!IsDelimiter(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Returns true for the characters treated as operator delimiters:
    /// space, carriage return (0x0d) and line feed (0x0a).
    /// </summary>
    private static bool IsDelimiter(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }
    #endregion
}

Author:	mohammad [ Tue Nov 04, 2008 11:08 am ]
Post subject:	How to Extract Unicode Text From PDF Files?
Dear All, Is there any way to use PDFSharp or MigraDoc to extract text (Unicode and ASCII) from pdf files?

Author:	marihanzo [ Tue Mar 17, 2009 4:23 pm ]
Post subject:	Extract text from pdf
After a long search over the internet I couldn't find anything related to this problem. So I decided to implement my own function to extract text from PDF files in C#. Starting from a procedure found in the samples of the iText library, I coded this class that does what I need. Unfortunately I can't guarantee that the extracted text is always the whole content of the PDF file, but during my tests fortunately it was. I hope this will help; the class code follows. PDFParser class Code: public class PDFParser { /// BT = Beginning of a text object operator /// ET = End of a text object operator /// Td move to the start of next line /// 5 Ts = superscript /// -5 Ts = subscript #region Fields #region _numberOfCharsToKeep /// <summary> /// The number of characters to keep, when extracting text. /// </summary> private static int _numberOfCharsToKeep = 15; #endregion #endregion #region ExtractTextFromPDFBytes /// <summary> /// This method processes an uncompressed Adobe (text) object /// and extracts text. /// </summary> /// <param name="input">uncompressed</param> /// <returns></returns> public string ExtractTextFromPDFBytes(byte[] input) { if (input == null || input.Length == 0) return ""; try { string resultString = ""; // Flag showing if we are we currently inside a text object bool inTextObject = false; // Flag showing if the next character is literal // e.g. '\\' to get a '\' character or '\(' to get '(' bool nextLiteral = false; // () Bracket nesting level. 
Text appears inside () int bracketDepth = 0; // Keep previous chars to get extract numbers etc.: char[] previousCharacters = new char[_numberOfCharsToKeep]; for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' '; for (int i = 0; i < input.Length; i++) { char c = (char)input[i]; if (inTextObject) { // Position the text if (bracketDepth == 0) { if (CheckToken(new string[] { "TD", "Td" }, previousCharacters)) { resultString += "\n\r"; } else { if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters)) { resultString += "\n"; } else { if (CheckToken(new string[] { "Tj" }, previousCharacters)) { resultString += " "; } } } } // End of a text object, also go to a new line. if (bracketDepth == 0 && CheckToken(new string[] { "ET" }, previousCharacters)) { inTextObject = false; resultString += " "; } else { // Start outputting text if ((c == '(') && (bracketDepth == 0) && (!nextLiteral)) { bracketDepth = 1; } else { // Stop outputting text if ((c == ')') && (bracketDepth == 1) && (!nextLiteral)) { bracketDepth = 0; } else { // Just a normal text character: if (bracketDepth == 1) { // Only print out next character no matter what. // Do not interpret. if (c == '\\' && !nextLiteral) { nextLiteral = true; } else { if (((c >= ' ') && (c <= '~')) || ((c >= 128) && (c < 255))) { resultString += c.ToString(); } nextLiteral = false; } } } } } } // Store the recent characters for // when we have to go back for a checking for (int j = 0; j < _numberOfCharsToKeep - 1; j++) { previousCharacters[j] = previousCharacters[j + 1]; } previousCharacters[_numberOfCharsToKeep - 1] = c; // Start of a text object if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters)) { inTextObject = true; } } return resultString; } catch { return ""; } } #endregion #region CheckToken /// <summary> /// Check if a certain 2 character token just came along (e.g. 
BT) /// </summary> /// <param name="search">the searched token</param> /// <param name="recent">the recent character array</param> /// <returns></returns> private bool CheckToken(string[] tokens, char[] recent) { foreach (string token in tokens) { if (token.Length > 1) { if ((recent[_numberOfCharsToKeep - 3] == token[0]) && (recent[_numberOfCharsToKeep - 2] == token[1]) && ((recent[_numberOfCharsToKeep - 1] == ' ') || (recent[_numberOfCharsToKeep - 1] == 0x0d) || (recent[_numberOfCharsToKeep - 1] == 0x0a)) && ((recent[_numberOfCharsToKeep - 4] == ' ') || (recent[_numberOfCharsToKeep - 4] == 0x0d) || (recent[_numberOfCharsToKeep - 4] == 0x0a)) ) { return true; } } else { return false; } } return false; } #endregion } Application code* Code: public override String ExtractText() { String outputText = ""; try { PdfDocument inputDocument = PdfReader.Open(this._sDirectory + this._sFileName, PdfDocumentOpenMode.ReadOnly); foreach (PdfPage page in inputDocument.Pages) { for (int index = 0; index < page.Contents.Elements.Count; index++) { PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream; outputText += new PDFParser().ExtractTextFromPDFBytes(stream.Value); } } } catch (Exception e) { PDF_ParseException oEx = new PDF_ParseException(this, e); oEx.Log(); oEx.ToPdf(this._sDirectoryException); } return outputText; } Enjoy it!

Author:	making1971 [ Tue Sep 04, 2012 11:06 pm ]
Post subject:	Re: How to Extract Unicode Text From PDF Files?
Hi All, I'm fairly new at this, so please go gentle. I've played about with this code and it seems to get the text out of my document, but in one long line with no line breaks etc. Does anyone know of a simple way round this? Cheers,

PDFsharp & MigraDoc Foundation https://forum.pdfsharp.net/

How to Extract Unicode Text From PDF Files? https://forum.pdfsharp.net/viewtopic.php?f=2&t=527	Page 1 of 1

Page 1 of 1	All times are UTC
Powered by phpBB® Forum Software © phpBB Group https://www.phpbb.com/