.net, C#, Code, Microsoft, Standards, Technical, XML

Strip Illegal XML Characters based on W3C standard

The W3C has defined which characters are illegal (or discouraged) in XML documents. You can find the details here:

XML 1.0 | XML 1.1

Here is a function to remove these characters from a specified XML file:

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;

namespace XMLUtils
{
    class Standards
    {
        /// <summary>
        /// Removes hexadecimal character references to discouraged (XML 1.0 and 1.1)
        /// and restricted (XML 1.1 only) code points from a file, rewriting the file
        /// in place only when at least one reference was removed.
        /// Refer to http://www.w3.org/TR/xml11/#charsets for XML 1.1
        /// Refer to http://www.w3.org/TR/2006/REC-xml-20060816/#charsets for XML 1.0
        /// </summary>
        /// <remarks>
        /// Matching is textual: the method looks for the reference notation
        /// ("#x" followed by hex digits), not for the raw characters themselves.
        /// The file is read and written as UTF-8.
        /// </remarks>
        /// <param name="filePath">Full path to the File</param>
        /// <param name="XMLVersion">XML Specification to use. Can be 1.0 or 1.1</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="XMLVersion"/> is neither "1.0" nor "1.1".
        /// </exception>
        private void StripIllegalXMLChars(string filePath, string XMLVersion)
        {
            // Build the regex first so an unsupported version is rejected
            // before any file I/O takes place.
            Regex regex = CreateReferenceRegex(XMLVersion);

            string contents = File.ReadAllText(filePath, Encoding.UTF8);
            string cleaned = regex.Replace(contents, String.Empty);

            // The replacement text is empty, so a shorter result means at least one
            // match was removed; untouched files are not rewritten.
            if (cleaned.Length != contents.Length)
            {
                File.WriteAllText(filePath, cleaned, Encoding.UTF8);
            }
        }

        /// <summary>
        /// Builds the case-insensitive regex that matches the character references
        /// to strip for the given XML version.
        /// </summary>
        /// <param name="xmlVersion">"1.0" or "1.1".</param>
        /// <returns>A regex matching complete and bare hex references.</returns>
        /// <exception cref="ArgumentException">The version is not supported.</exception>
        private static Regex CreateReferenceRegex(string xmlVersion)
        {
            string hex;
            switch (xmlVersion)
            {
                case "1.0":
                    // xFFFE/xFFFF of planes 1-16, FDD0-FDDF, 7F, 80-84, 86-8F, 90-9F.
                    // The original pattern lacked the '|' before 9[0-9A-F], which made it
                    // match four-digit values such as 8090 instead of 80-9F.
                    hex = @"(?:10?|[2-F])FFF[EF]|FDD[0-9A-F]|7F|8[0-46-9A-F]|9[0-9A-F]";
                    break;
                case "1.1":
                    // Same set as 1.0 plus the control codes 1-8, B, C, E-1F.
                    hex = @"(?:10?|[2-F])FFF[EF]|FDD[0-9A-F]|[19][0-9A-F]|7F|8[0-46-9A-F]|0?[1-8BCEF]";
                    break;
                default:
                    throw new ArgumentException("Error: Invalid XML Version!", "XMLVersion");
            }

            // First alternative: a complete reference (ampersand ... semicolon) is removed
            // as a whole, so no dangling "&;" is left in the document.
            // Second alternative: a bare "#x.." token, kept for backward compatibility.
            // The negative lookahead stops it from stripping the prefix of a longer
            // number (e.g. the "1F" in "#x1F600").
            string pattern = "&#x(?:" + hex + ");|#x(?:" + hex + ")(?![0-9A-F])";
            return new Regex(pattern, RegexOptions.IgnoreCase);
        }
    }
}
Advertisements

23 thoughts on “Strip Illegal XML Characters based on W3C standard”

  1. Here is the php version:

    /**
     * Replaces every byte of $in that falls outside the XML 1.0 Char ranges
     * with a single space and returns the result.
     *
     * The string is examined one byte at a time via ord(), so only values
     * 0-255 are ever seen; in practice this blanks the control bytes below
     * 0x20 other than tab, line feed and carriage return. The higher ranges
     * are kept so the test mirrors the Char production of the specification.
     *
     * @param string $in Text to clean.
     * @return string Text of the same length with disallowed bytes blanked.
     */
    function strip_invalid_xml_chars2( $in )
    {
        $out = "";
        $length = strlen($in);

        for ( $i = 0; $i < $length; $i++ )
        {
            $current = ord($in[$i]);

            if ( ($current == 0x9) || ($current == 0xA) || ($current == 0xD)
                || (($current >= 0x20) && ($current <= 0xD7FF))
                || (($current >= 0xE000) && ($current <= 0xFFFD))
                || (($current >= 0x10000) && ($current <= 0x10FFFF)) )
            {
                $out .= chr($current);
            }
            else
            {
                $out .= " ";
            }
        }

        return $out;
    }

  2. Hi Ramesh,
    i just want the code for trimming non-printable characters.
    The one placed on this page is going beyond the margins.

    I would appreciate it if you could send it to the email address mentioned above.

    thanks,
    Nagesh

  3. hi Balaji

    Could you please email the code as I am unable to view it due to margin issues on the webpage? I would really appreciate that.

    Thanks again

  4. Hi!

    I tried implementing your function, but now I get an error saying “Illegal characters in path”. Any suggestions please?

    many thanks,
    TS

  5. Hello Balaji,

    I just came across your implementation, is it possible to email me your solution…thanks very much.

    -Minhas

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s