PDFsharp & MigraDoc Foundation • View topic - How to Extract Unicode Text From PDF Files?

View unanswered posts | View active topics

Board index » PDFsharp & MigraDoc » Support

All times are UTC

Forum rules

Please read this before posting on this forum: Forum Rules

How to Extract Unicode Text From PDF Files?

Moderator: Stefan Lange

Page 1 of 1

[ 3 posts ]

Print view

Previous topic | Next topic

Author

Message

mohammad

Post subject: How to Extract Unicode Text From PDF Files?

Posted: Tue Nov 04, 2008 11:08 am

Joined: Tue Nov 04, 2008 10:42 am
Posts: 1

Dear All,
Is there any way to use PDFSharp or MigraDoc to extract text (Unicode and ASCII) from pdf files?

Top

marihanzo

Post subject: Extract text from pdf

Posted: Tue Mar 17, 2009 4:23 pm

Joined: Tue Mar 17, 2009 4:12 pm
Posts: 2

After a long search over the internet I couldn't find anything related to this problem.

So I decided to implement my own function to extract text from pdf files in C#.
Starting from a procedure found in the samples of the iText library, I wrote this class, which does what I need.

Unfortunately, I can't guarantee that the extracted text is always the whole content of the PDF file, but during my tests it fortunately was.

I hope this will help, the class code follows.

PDFParser class

Code:

    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

      

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object 
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal 
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what. 
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for 
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }
                return resultString;
            }
            catch
            {
                return "";
            }
        }
        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="search">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if (token.Length > 1)
                {
                    if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                        (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                        ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                        (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                        (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                        ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                        (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                        (recent[_numberOfCharsToKeep - 4] == 0x0a))
                        )
                    {
                        return true;
                    }
                }
                else
                {
                    return false;
                }

            }
            return false;
        }
        #endregion
    }

Application code

Code:

       public override String ExtractText()
        {
            String outputText = "";
            try
            {
                PdfDocument inputDocument = PdfReader.Open(this._sDirectory + this._sFileName, PdfDocumentOpenMode.ReadOnly);

                foreach (PdfPage page in inputDocument.Pages)
                {
                    for (int index = 0; index < page.Contents.Elements.Count; index++)
                    {

                        PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
                        outputText += new PDFParser().ExtractTextFromPDFBytes(stream.Value);
                    }
                }

            }
            catch (Exception e)
            {
                PDF_ParseException oEx = new PDF_ParseException(this, e);
                oEx.Log();
                oEx.ToPdf(this._sDirectoryException);
            }
            return outputText;
        }

Enjoy it!

Top

making1971

Post subject: Re: How to Extract Unicode Text From PDF Files?

Posted: Tue Sep 04, 2012 11:06 pm

Joined: Tue Sep 04, 2012 10:42 pm
Posts: 2

Hi All,

I'm fairly new at this, so please go gentle.

I've played about with this code, and it seems to get the text out of my document, but as one long line with no line breaks etc. Does anyone know of a simple way round this?

Cheers,

Top

Page 1 of 1

[ 3 posts ]

Board index » PDFsharp & MigraDoc » Support

All times are UTC

Who is online

Users browsing this forum: No registered users and 20 guests

You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot post attachments in this forum