using System;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

/*
 * (c) Craig Dunn - ConceptDevelopment.NET
 * 5-July-04
 *
 * To use:
 *   string encoded = ExtendedHtmlUtility.HtmlEntityEncode ("test string with Unicode chars and & < >");
 *   string decoded = ExtendedHtmlUtility.HtmlEntityDecode (encoded); // "test string with Unicode chars and & < >"
 */

/// <summary>
/// Encodes text to ASCII-only HTML (non-ASCII characters become numeric
/// character references) and decodes such text back again.
/// </summary>
public class ExtendedHtmlUtility
{
    /// <summary>
    /// Static Regular Expression to match Html Entities in encoded text.
    /// Group 'unicode' captures the digits of a decimal reference (&amp;#NNN;),
    /// group 'html' captures the name of a named entity (&amp;name;).
    /// [0-9] is used rather than \d because \d also matches non-ASCII digits,
    /// which the integer parser does not accept.
    /// </summary>
    private static readonly Regex entityResolver =
        new Regex (@"([&][#](?'unicode'[0-9]+);)|([&](?'html'\w+);)");

    /// <summary>
    /// Encodes text so that it contains only printable ASCII characters.
    ///
    /// Based on the 'reflected' code (from the Framework System.Web.HttpServerUtility)
    /// listed on this page
    /// UrlEncode vs. HtmlEncode
    /// http://www.aspnetresources.com/blog/encoding_forms.aspx
    ///
    /// PDF of unicode characters in the 0-127 (dec) range
    /// http://www.unicode.org/charts/PDF/U0000.pdf
    /// </summary>
    /// <param name="unicodeText">Text to encode; null is treated as empty.</param>
    /// <returns>
    /// The encoded text:
    /// '&amp;' becomes "&amp;amp;", '&lt;' becomes "&amp;lt;", '&gt;' becomes "&amp;gt;";
    /// every other character from space (0x20) to tilde (0x7E) is left unchanged;
    /// anything else (control characters, DEL and all non-ASCII UTF-16 code units)
    /// becomes a decimal reference such as "&amp;#233;".
    /// </returns>
    /// <remarks>
    /// Quote characters are NOT escaped, so the result should not be placed
    /// inside an HTML attribute value without further escaping.
    /// Characters outside the Basic Multilingual Plane are emitted as two
    /// references (one per surrogate); HtmlEntityDecode reassembles them.
    /// </remarks>
    public static string HtmlEntityEncode (string unicodeText)
    {
        if (unicodeText == null || unicodeText.Length == 0)
        {
            return String.Empty;
        }

        // StringBuilder avoids the quadratic cost of repeated string concatenation.
        StringBuilder encoded = new StringBuilder (unicodeText.Length);
        foreach (char c in unicodeText)
        {
            switch (c)
            {
                case '&':
                    encoded.Append ("&amp;");
                    break;
                case '<':
                    encoded.Append ("&lt;");
                    break;
                case '>':
                    encoded.Append ("&gt;");
                    break;
                default:
                    if (c >= ' ' && c <= '\u007E')
                    {
                        // printable ASCII, hex 20-7E (dec 32-126): leave as-is
                        encoded.Append (c);
                    }
                    else
                    {
                        // outside printable ASCII: emit a decimal character reference
                        encoded.Append ("&#");
                        encoded.Append (((int) c).ToString (NumberFormatInfo.InvariantInfo));
                        encoded.Append (';');
                    }
                    break;
            }
        }
        return encoded.ToString ();
    } // HtmlEntityEncode

    /// <summary>
    /// Converts Html Entities back to their 'underlying' Unicode characters.
    /// </summary>
    /// <param name="encodedText">Text containing entities; null is treated as empty.</param>
    /// <returns>
    /// The text with every recognised decimal reference ("&amp;#233;") and
    /// named entity ("&amp;amp;", "&amp;lt;", ...) replaced by its character.
    /// Entities that cannot be resolved are left exactly as they appeared.
    /// </returns>
    public static string HtmlEntityDecode (string encodedText)
    {
        if (encodedText == null || encodedText.Length == 0)
        {
            return String.Empty;
        }
        return entityResolver.Replace (encodedText, new MatchEvaluator (ResolveEntity));
    } // HtmlEntityDecode

    /// <summary>
    /// Regex Match processing delegate to replace the Entities with their
    /// underlying Unicode character.
    ///
    /// List of entities
    /// http://www.vigay.com/inet/acorn/browse-html2.html#entities
    /// </summary>
    /// <param name="matchToProcess">A match produced by entityResolver.</param>
    /// <returns>
    /// The decoded character(s), or the original matched text when the entity
    /// is not recognised or its numeric value is not a valid code point.
    /// </returns>
    private static string ResolveEntity (Match matchToProcess)
    {
        Group numeric = matchToProcess.Groups["unicode"];
        if (numeric.Success)
        {
            int codePoint;
            // TryParse (rather than Convert.ToInt32) so that an over-long digit
            // run leaves the text untouched instead of throwing OverflowException.
            if (int.TryParse (numeric.Value, NumberStyles.None, NumberFormatInfo.InvariantInfo, out codePoint))
            {
                if (codePoint <= char.MaxValue)
                {
                    // Includes lone surrogate values: the encoder writes one
                    // reference per UTF-16 code unit, so these must round-trip.
                    return ((char) codePoint).ToString ();
                }
                if (codePoint <= 0x10FFFF)
                {
                    // Supplementary plane: needs a surrogate pair.
                    return char.ConvertFromUtf32 (codePoint);
                }
            }
            return matchToProcess.Value;
        }

        Group named = matchToProcess.Groups["html"];
        if (named.Success)
        {
            // Names are matched case-insensitively; the invariant culture keeps
            // the lookup independent of the current thread culture.
            switch (named.Value.ToLowerInvariant ())
            {
                // this could be expanded to as many as you like, from
                // http://www.vigay.com/inet/acorn/browse-html2.html#entities
                case "nbsp": return " "; // decoded to a plain space, not U+00A0
                case "copy": return "\u00A9";
                case "lt":   return "<";
                case "gt":   return ">";
                case "amp":  return "&";
                case "quot": return "\"";
                case "apos": return "'";
            }
        }

        // Unknown entity: keep the source text rather than corrupting it.
        return matchToProcess.Value;
    } // ResolveEntity()
} // class ExtendedHtmlUtility