Logo Questions Linux Laravel Mysql Ubuntu Git Menu

How to convert an rtf string to text in C#



Is there an easy way to extract text from an Rtf string without using RichTextBox?


{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par} 
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}

should return:

foo
bar

like image 226
dcarneiro Avatar asked Apr 12 '11 11:04


People also ask

How do I convert RTF text to plain text?

On the File tab, choose Options > Mail. Under Compose messages, in the Compose messages in this format list, click HTML, Rich Text, or Plain Text.

What is RTF string?

Rich Text Format (RTF) is a text formatting language devised by Microsoft Corporation. You can represent character, paragraph, and document format attributes using plain text with interspersed RTF commands, groups, and escape sequences.

1 Answer

How to do it in pure C# without any references to other libraries:

This guy wrote a class that strips RTF to plain text just as OP requested. Here is the source

This is his code:

    /// <summary>
    /// Rich Text Stripper: converts RTF markup to plain text without using a RichTextBox.
    /// </summary>
    /// <remarks>
    /// Translated from Python located at:
    /// http://stackoverflow.com/a/188877/448
    /// </remarks>
    public static class RichTextStripper
    {
        /// <summary>
        /// Parser state captured when a group opens ("{") and restored when it closes ("}").
        /// </summary>
        private class StackEntry
        {
            public int NumberOfCharactersToSkip { get; set; }
            public bool Ignorable { get; set; }

            public StackEntry(int numberOfCharactersToSkip, bool ignorable)
            {
                NumberOfCharactersToSkip = numberOfCharactersToSkip;
                Ignorable = ignorable;
            }
        }

        // Tokenizer. Capture groups, in order:
        //   1 = control word, 2 = its optional numeric argument, 3 = two hex digits of \'xx,
        //   4 = control symbol (backslash + non-letter), 5 = group brace, 6 = any other single character.
        // Runs of CR/LF match the unnumbered alternative and therefore produce no output.
        private static readonly Regex _rtfRegex = new Regex(@"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Control words that mark the enclosing group as a destination whose content is not emitted.
        // NOTE(review): the initializer was missing from this copy of the file; the entries below were
        // restored from the Python original referenced in the class remarks - confirm against that source.
        private static readonly HashSet<string> destinations = new HashSet<string>(StringComparer.Ordinal)
        {
            "aftncn", "aftnsep", "aftnsepc", "annotation", "atnauthor", "atndate", "atnicn", "atnid",
            "atnparent", "atnref", "atntime", "atrfend", "atrfstart", "author", "background",
            "bkmkend", "bkmkstart", "blipuid", "buptim", "category", "colorschememapping",
            "colortbl", "comment", "company", "creatim", "datafield", "datastore", "defchp", "defpap",
            "do", "doccomm", "docvar", "dptxbxtext", "ebcend", "ebcstart", "factoidname", "falt",
            "fchars", "ffdeftext", "ffentrymcr", "ffexitmcr", "ffformat", "ffhelptext", "ffl",
            "ffname", "ffstattext", "field", "file", "filetbl", "fldinst", "fldrslt", "fldtype",
            "fname", "fontemb", "fontfile", "fonttbl", "footer", "footerf", "footerl", "footerr",
            "footnote", "formfield", "ftncn", "ftnsep", "ftnsepc", "g", "generator", "gridtbl",
            "header", "headerf", "headerl", "headerr", "hl", "hlfr", "hlinkbase", "hlloc", "hlsrc",
            "hsv", "htmltag", "info", "keycode", "keywords", "latentstyles", "lchars", "levelnumbers",
            "leveltext", "lfolevel", "linkval", "list", "listlevel", "listname", "listoverride",
            "listoverridetable", "listpicture", "liststylename", "listtable", "listtext",
            "lsdlockedexcept", "macc", "maccPr", "mailmerge", "maln", "malnScr", "manager", "margPr",
            "mbar", "mbarPr", "mbaseJc", "mbegChr", "mborderBox", "mborderBoxPr", "mbox", "mboxPr",
            "mchr", "mcount", "mctrlPr", "md", "mdeg", "mdegHide", "mden", "mdiff", "mdPr", "me",
            "mendChr", "meqArr", "meqArrPr", "mf", "mfName", "mfPr", "mfunc", "mfuncPr", "mgroupChr",
            "mgroupChrPr", "mgrow", "mhideBot", "mhideLeft", "mhideRight", "mhideTop", "mhtmltag",
            "mlim", "mlimloc", "mlimlow", "mlimlowPr", "mlimupp", "mlimuppPr", "mm", "mmaddfieldname",
            "mmath", "mmathPict", "mmathPr", "mmaxdist", "mmc", "mmcJc", "mmconnectstr",
            "mmconnectstrdata", "mmcPr", "mmcs", "mmdatasource", "mmheadersource", "mmmailsubject",
            "mmodso", "mmodsofilter", "mmodsofldmpdata", "mmodsomappedname", "mmodsoname",
            "mmodsorecipdata", "mmodsosort", "mmodsosrc", "mmodsotable", "mmodsoudl",
            "mmodsoudldata", "mmodsouniquetag", "mmPr", "mmquery", "mmr", "mnary", "mnaryPr",
            "mnoBreak", "mnum", "mobjDist", "moMath", "moMathPara", "moMathParaPr", "mopEmu",
            "mphant", "mphantPr", "mplcHide", "mpos", "mr", "mrad", "mradPr", "mrPr", "msepChr",
            "mshow", "mshp", "msPre", "msPrePr", "msSub", "msSubPr", "msSubSup", "msSubSupPr", "msSup",
            "msSupPr", "mstrikeBLTR", "mstrikeH", "mstrikeTLBR", "mstrikeV", "msub", "msubHide",
            "msup", "msupHide", "mtransp", "mtype", "mvertJc", "mvfmf", "mvfml", "mvtof", "mvtol",
            "mzeroAsc", "mzeroDesc", "mzeroWid", "nesttableprops", "nextfile", "nonesttables",
            "objalias", "objclass", "objdata", "object", "objname", "objsect", "objtime", "oldcprops",
            "oldpprops", "oldsprops", "oldtprops", "oleclsid", "operator", "panose", "password",
            "passwordhash", "pgp", "pgptbl", "picprop", "pict", "pn", "pnseclvl", "pntext", "pntxta",
            "pntxtb", "printim", "private", "propname", "protend", "protstart", "protusertbl", "pxe",
            "result", "revtbl", "revtim", "rsidtbl", "rxe", "shp", "shpgrp", "shpinst",
            "shppict", "shprslt", "shptxt", "sn", "sp", "staticval", "stylesheet", "subject", "sv",
            "svb", "tc", "template", "themedata", "title", "txe", "ud", "upr", "userprops",
            "wgrffmtfilter", "windowcaption", "writereservation", "writereservhash", "xe", "xform",
            "xmlattrname", "xmlattrvalue", "xmlclose", "xmlname", "xmlnstbl",
            "xmlopen"
        };

        // Control words that translate directly into literal output text.
        private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
        {
            { "par", "\n" },
            { "sect", "\n\n" },
            { "page", "\n\n" },
            { "line", "\n" },
            { "tab", "\t" },
            { "emdash", "\u2014" },
            { "endash", "\u2013" },
            { "emspace", "\u2003" },
            { "enspace", "\u2002" },
            { "qmspace", "\u2005" },
            { "bullet", "\u2022" },
            { "lquote", "\u2018" },
            { "rquote", "\u2019" },
            { "ldblquote", "\u201C" },
            { "rdblquote", "\u201D" },
        };

        /// <summary>
        /// Strip RTF Tags from RTF Text
        /// </summary>
        /// <param name="inputRtf">RTF formatted text</param>
        /// <returns>
        /// Plain text from RTF; <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>,
        /// and an empty string when it is empty.
        /// </returns>
        /// <remarks>
        /// Malformed input is tolerated rather than rejected: an unmatched closing brace is ignored,
        /// and a <c>\u</c> or <c>\uc</c> control word without a usable numeric argument is skipped.
        /// </remarks>
        public static string StripRichTextFormat(string inputRtf)
        {
            if (inputRtf == null)
            {
                return null;
            }

            var stack = new Stack<StackEntry>();
            bool ignorable = false;              // Whether this group (and all inside it) are "ignorable".
            int ucskip = 1;                      // Number of ASCII characters to skip after a unicode character.
            int curskip = 0;                     // Number of ASCII characters left to skip
            var output = new System.Text.StringBuilder(inputRtf.Length);

            // The final regex alternative matches any single character, so only an empty input
            // yields no matches; the loop then simply does not run and "" is returned.
            foreach (Match match in _rtfRegex.Matches(inputRtf))
            {
                string word = match.Groups[1].Value;
                string arg = match.Groups[2].Value;
                string hex = match.Groups[3].Value;
                string character = match.Groups[4].Value;
                string brace = match.Groups[5].Value;
                string tchar = match.Groups[6].Value;

                if (!String.IsNullOrEmpty(brace))
                {
                    curskip = 0;
                    if (brace == "{")
                    {
                        // Push state
                        stack.Push(new StackEntry(ucskip, ignorable));
                    }
                    else if (stack.Count > 0)
                    {
                        // Pop state. The Count guard keeps an unmatched "}" from throwing.
                        StackEntry entry = stack.Pop();
                        ucskip = entry.NumberOfCharactersToSkip;
                        ignorable = entry.Ignorable;
                    }
                }
                else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
                {
                    curskip = 0;
                    if (character == "~")
                    {
                        if (!ignorable)
                        {
                            output.Append('\u00A0'); // non-breaking space
                        }
                    }
                    else if ("{}\\".Contains(character))
                    {
                        if (!ignorable)
                        {
                            output.Append(character); // escaped literal brace or backslash
                        }
                    }
                    else if (character == "*")
                    {
                        ignorable = true;
                    }
                }
                else if (!String.IsNullOrEmpty(word)) // \foo
                {
                    curskip = 0;
                    string replacement;
                    int number;
                    if (destinations.Contains(word))
                    {
                        ignorable = true;
                    }
                    else if (ignorable)
                    {
                        // Control words inside an ignorable group have no effect.
                    }
                    else if (specialCharacters.TryGetValue(word, out replacement))
                    {
                        output.Append(replacement);
                    }
                    else if (word == "uc")
                    {
                        // TryParse: the argument is optional in the regex and may be absent.
                        if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out number))
                        {
                            ucskip = number;
                        }
                    }
                    else if (word == "u")
                    {
                        if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out number))
                        {
                            int c = number;
                            if (c < 0)
                            {
                                c += 0x10000; // negative arguments encode values above 0x7FFF
                            }

                            if (c >= 0 && c <= 0xFFFF)
                            {
                                // Appended as a UTF-16 code unit: surrogate halves written as two
                                // consecutive \u words recombine into one character, whereas
                                // Char.ConvertFromUtf32 would throw on each half.
                                output.Append((char)c);
                            }
                            else if (c > 0xFFFF && c <= 0x10FFFF)
                            {
                                output.Append(Char.ConvertFromUtf32(c));
                            }
                            else
                            {
                                output.Append('?'); // not a Unicode code point
                            }

                            curskip = ucskip;
                        }
                    }
                }
                else if (!String.IsNullOrEmpty(hex)) // \'xx
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        // The byte value is used directly as the code point; no code-page mapping is applied.
                        int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture);
                        output.Append((char)c);
                    }
                }
                else if (!String.IsNullOrEmpty(tchar))
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        output.Append(tchar);
                    }
                }
            }

            return output.ToString();
        }
    }

EDIT 1: In the meantime, we have been running this code in tests and an adapted version of it in production. The new version below adds some safety checks and handles new lines better.

/// <summary>
/// Strip RTF Tags from RTF Text
/// </summary>
/// <param name="inputRtf">RTF formatted text</param>
/// <returns>
/// Plain text from RTF; <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>,
/// and an empty string when it is empty.
/// </returns>
/// <remarks>
/// Malformed input is tolerated rather than rejected: an unmatched closing brace is ignored,
/// and a <c>\u</c> or <c>\uc</c> control word without a usable numeric argument is skipped.
/// </remarks>
public static string StripRichTextFormat(string inputRtf)
{
    if (inputRtf == null)
    {
        return null;
    }

    var stack = new Stack<StackEntry>();
    bool ignorable = false;              // Whether this group (and all inside it) are "ignorable".
    int ucskip = 1;                      // Number of ASCII characters to skip after a unicode character.
    int curskip = 0;                     // Number of ASCII characters left to skip
    var output = new System.Text.StringBuilder(inputRtf.Length);

    // When the regex produces no matches the loop does not run and "" is returned,
    // which is what the original computed after overwriting its fallback assignment.
    foreach (Match match in _rtfRegex.Matches(inputRtf))
    {
        string word = match.Groups[1].Value;
        string arg = match.Groups[2].Value;
        string hex = match.Groups[3].Value;
        string character = match.Groups[4].Value;
        string brace = match.Groups[5].Value;
        string tchar = match.Groups[6].Value;

        if (!String.IsNullOrEmpty(brace))
        {
            curskip = 0;
            if (brace == "{")
            {
                // Push state
                stack.Push(new StackEntry(ucskip, ignorable));
            }
            else if (stack.Count > 0)
            {
                // Pop state. The Count guard keeps an unmatched "}" from throwing.
                StackEntry entry = stack.Pop();
                ucskip = entry.NumberOfCharactersToSkip;
                ignorable = entry.Ignorable;
            }
        }
        else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
        {
            curskip = 0;
            if (character == "~")
            {
                if (!ignorable)
                {
                    output.Append('\u00A0'); // non-breaking space
                }
            }
            else if ("{}\\".Contains(character))
            {
                if (!ignorable)
                {
                    output.Append(character); // escaped literal brace or backslash
                }
            }
            else if (character == "*")
            {
                ignorable = true;
            }
        }
        else if (!String.IsNullOrEmpty(word)) // \foo
        {
            curskip = 0;
            string replacement;
            int number;
            if (destinations.Contains(word))
            {
                ignorable = true;
            }
            else if (ignorable)
            {
                // Control words inside an ignorable group have no effect.
            }
            else if (specialCharacters.TryGetValue(word, out replacement))
            {
                output.Append(replacement);
            }
            else if (word == "uc")
            {
                // TryParse: the numeric argument may be absent.
                if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out number))
                {
                    ucskip = number;
                }
            }
            else if (word == "u")
            {
                if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out number))
                {
                    int c = number;
                    if (c < 0)
                    {
                        c += 0x10000; // negative arguments encode values above 0x7FFF
                    }

                    // A valid UTF-32 value lies between 0x000000 and 0x10ffff (inclusive) and
                    // excludes the surrogate range 0x00d800-0x00dfff. Values up to 0xFFFF are
                    // therefore appended as UTF-16 code units instead, so that surrogate halves
                    // written as two consecutive \u words recombine into one character.
                    if (c >= 0 && c <= 0xFFFF)
                    {
                        output.Append((char)c);
                    }
                    else if (c > 0xFFFF && c <= 0x10FFFF)
                    {
                        output.Append(Char.ConvertFromUtf32(c));
                    }
                    else
                    {
                        output.Append('?'); // not a Unicode code point
                    }

                    curskip = ucskip;
                }
            }
        }
        else if (!String.IsNullOrEmpty(hex)) // \'xx
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                // The byte value is used directly as the code point; no code-page mapping is applied.
                int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture);
                output.Append((char)c);
            }
        }
        else if (!String.IsNullOrEmpty(tchar))
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                output.Append(tchar);
            }
        }
    }

    return output.ToString();
}
like image 166
Luchspeter Avatar answered Sep 28 '22 17:09
