The htmlentities() function converts characters to HTML entities. Tip: To convert HTML entities back to characters, use the html_entity_decode() function. Tip: Use the get_html_translation_table() function to return the translation table used by htmlentities().
Character entities are used to display reserved characters in HTML. A character entity looks like this: &entity_name; or &#entity_number;. To display a less than sign (<) we must write: &lt; or &#60;. Advantage of using an entity name: An entity name is easy to remember.
An HTML entity is a piece of text ("string") that begins with an ampersand ( & ) and ends with a semicolon ( ; ) . Entities are frequently used to display reserved characters (which would otherwise be interpreted as HTML code), and invisible characters (like non-breaking spaces).
With the help of bucabay and the advice to create my own function i created this one which works for me. What do you guys think, is there a better solution somewhere?
// Define escapeHtmlEntities only if nothing with that name exists yet.
// `var` is used instead of an undeclared assignment so the definition also
// works in strict mode / ES modules, where assigning to an undeclared name
// throws a ReferenceError. In a classic script the hoisted `var` does not
// overwrite an already existing global, so the guard keeps its meaning.
if (typeof escapeHtmlEntities === 'undefined') {
  /**
   * Escapes a string for safe inclusion in HTML.
   *
   * Replaces `<`, `>`, `&`, `"`, `'` and every character in the range
   * U+00A0..U+2666 with a character reference: the named entity from
   * `escapeHtmlEntities.entityTable` when one exists, otherwise a decimal
   * numeric reference (`&#NNN;`). Characters outside those sets (plain
   * ASCII, anything above U+2666, surrogate halves) are left unchanged.
   *
   * @param {string} text - The string to escape.
   * @returns {string} The escaped string.
   */
  var escapeHtmlEntities = function (text) {
    // The quotes are part of the character class so that the `quot` and
    // `apos` table entries are actually used and the result is also safe
    // inside quoted attribute values.
    return text.replace(/[\u00A0-\u2666<>&"']/g, function (c) {
      var code = c.charCodeAt(0);
      return '&' + (escapeHtmlEntities.entityTable[code] || '#' + code) + ';';
    });
  };

  // all HTML4 entities as defined here: http://www.w3.org/TR/html4/sgml/entities.html
  // added: amp, lt, gt, quot and apos
  // (note: `apos` is defined by XML and HTML5, but not by HTML4)
  escapeHtmlEntities.entityTable = {
    34 : 'quot',
    38 : 'amp',
    39 : 'apos',
    60 : 'lt',
    62 : 'gt',
    160 : 'nbsp',
    161 : 'iexcl',
    162 : 'cent',
    163 : 'pound',
    164 : 'curren',
    165 : 'yen',
    166 : 'brvbar',
    167 : 'sect',
    168 : 'uml',
    169 : 'copy',
    170 : 'ordf',
    171 : 'laquo',
    172 : 'not',
    173 : 'shy',
    174 : 'reg',
    175 : 'macr',
    176 : 'deg',
    177 : 'plusmn',
    178 : 'sup2',
    179 : 'sup3',
    180 : 'acute',
    181 : 'micro',
    182 : 'para',
    183 : 'middot',
    184 : 'cedil',
    185 : 'sup1',
    186 : 'ordm',
    187 : 'raquo',
    188 : 'frac14',
    189 : 'frac12',
    190 : 'frac34',
    191 : 'iquest',
    192 : 'Agrave',
    193 : 'Aacute',
    194 : 'Acirc',
    195 : 'Atilde',
    196 : 'Auml',
    197 : 'Aring',
    198 : 'AElig',
    199 : 'Ccedil',
    200 : 'Egrave',
    201 : 'Eacute',
    202 : 'Ecirc',
    203 : 'Euml',
    204 : 'Igrave',
    205 : 'Iacute',
    206 : 'Icirc',
    207 : 'Iuml',
    208 : 'ETH',
    209 : 'Ntilde',
    210 : 'Ograve',
    211 : 'Oacute',
    212 : 'Ocirc',
    213 : 'Otilde',
    214 : 'Ouml',
    215 : 'times',
    216 : 'Oslash',
    217 : 'Ugrave',
    218 : 'Uacute',
    219 : 'Ucirc',
    220 : 'Uuml',
    221 : 'Yacute',
    222 : 'THORN',
    223 : 'szlig',
    224 : 'agrave',
    225 : 'aacute',
    226 : 'acirc',
    227 : 'atilde',
    228 : 'auml',
    229 : 'aring',
    230 : 'aelig',
    231 : 'ccedil',
    232 : 'egrave',
    233 : 'eacute',
    234 : 'ecirc',
    235 : 'euml',
    236 : 'igrave',
    237 : 'iacute',
    238 : 'icirc',
    239 : 'iuml',
    240 : 'eth',
    241 : 'ntilde',
    242 : 'ograve',
    243 : 'oacute',
    244 : 'ocirc',
    245 : 'otilde',
    246 : 'ouml',
    247 : 'divide',
    248 : 'oslash',
    249 : 'ugrave',
    250 : 'uacute',
    251 : 'ucirc',
    252 : 'uuml',
    253 : 'yacute',
    254 : 'thorn',
    255 : 'yuml',
    402 : 'fnof',
    913 : 'Alpha',
    914 : 'Beta',
    915 : 'Gamma',
    916 : 'Delta',
    917 : 'Epsilon',
    918 : 'Zeta',
    919 : 'Eta',
    920 : 'Theta',
    921 : 'Iota',
    922 : 'Kappa',
    923 : 'Lambda',
    924 : 'Mu',
    925 : 'Nu',
    926 : 'Xi',
    927 : 'Omicron',
    928 : 'Pi',
    929 : 'Rho',
    931 : 'Sigma',
    932 : 'Tau',
    933 : 'Upsilon',
    934 : 'Phi',
    935 : 'Chi',
    936 : 'Psi',
    937 : 'Omega',
    945 : 'alpha',
    946 : 'beta',
    947 : 'gamma',
    948 : 'delta',
    949 : 'epsilon',
    950 : 'zeta',
    951 : 'eta',
    952 : 'theta',
    953 : 'iota',
    954 : 'kappa',
    955 : 'lambda',
    956 : 'mu',
    957 : 'nu',
    958 : 'xi',
    959 : 'omicron',
    960 : 'pi',
    961 : 'rho',
    962 : 'sigmaf',
    963 : 'sigma',
    964 : 'tau',
    965 : 'upsilon',
    966 : 'phi',
    967 : 'chi',
    968 : 'psi',
    969 : 'omega',
    977 : 'thetasym',
    978 : 'upsih',
    982 : 'piv',
    8226 : 'bull',
    8230 : 'hellip',
    8242 : 'prime',
    8243 : 'Prime',
    8254 : 'oline',
    8260 : 'frasl',
    8472 : 'weierp',
    8465 : 'image',
    8476 : 'real',
    8482 : 'trade',
    8501 : 'alefsym',
    8592 : 'larr',
    8593 : 'uarr',
    8594 : 'rarr',
    8595 : 'darr',
    8596 : 'harr',
    8629 : 'crarr',
    8656 : 'lArr',
    8657 : 'uArr',
    8658 : 'rArr',
    8659 : 'dArr',
    8660 : 'hArr',
    8704 : 'forall',
    8706 : 'part',
    8707 : 'exist',
    8709 : 'empty',
    8711 : 'nabla',
    8712 : 'isin',
    8713 : 'notin',
    8715 : 'ni',
    8719 : 'prod',
    8721 : 'sum',
    8722 : 'minus',
    8727 : 'lowast',
    8730 : 'radic',
    8733 : 'prop',
    8734 : 'infin',
    8736 : 'ang',
    8743 : 'and',
    8744 : 'or',
    8745 : 'cap',
    8746 : 'cup',
    8747 : 'int',
    8756 : 'there4',
    8764 : 'sim',
    8773 : 'cong',
    8776 : 'asymp',
    8800 : 'ne',
    8801 : 'equiv',
    8804 : 'le',
    8805 : 'ge',
    8834 : 'sub',
    8835 : 'sup',
    8836 : 'nsub',
    8838 : 'sube',
    8839 : 'supe',
    8853 : 'oplus',
    8855 : 'otimes',
    8869 : 'perp',
    8901 : 'sdot',
    8968 : 'lceil',
    8969 : 'rceil',
    8970 : 'lfloor',
    8971 : 'rfloor',
    9001 : 'lang',
    9002 : 'rang',
    9674 : 'loz',
    9824 : 'spades',
    9827 : 'clubs',
    9829 : 'hearts',
    9830 : 'diams',
    338 : 'OElig',
    339 : 'oelig',
    352 : 'Scaron',
    353 : 'scaron',
    376 : 'Yuml',
    710 : 'circ',
    732 : 'tilde',
    8194 : 'ensp',
    8195 : 'emsp',
    8201 : 'thinsp',
    8204 : 'zwnj',
    8205 : 'zwj',
    8206 : 'lrm',
    8207 : 'rlm',
    8211 : 'ndash',
    8212 : 'mdash',
    8216 : 'lsquo',
    8217 : 'rsquo',
    8218 : 'sbquo',
    8220 : 'ldquo',
    8221 : 'rdquo',
    8222 : 'bdquo',
    8224 : 'dagger',
    8225 : 'Dagger',
    8240 : 'permil',
    8249 : 'lsaquo',
    8250 : 'rsaquo',
    8364 : 'euro'
  };
}
usage example:
// Demo: escape a German sample string and show the escaped result in an alert dialog.
var text = "Übergroße Äpfel mit Würmern";
alert(escapeHtmlEntities (text));
result:
&Uuml;bergro&szlig;e &Auml;pfel mit W&uuml;rmern
Update1: Thanks bucabay again for the || - hint
Update2: Updated entity table with amp,lt,gt,apos,quot, thanks richardtallent for the hint
Update3(in 2014): Mathias Bynens created a lib called 'he', maybe it serves your need.
All the other solutions suggested here, as well as most other JavaScript libraries that do HTML entity encoding/decoding, make several mistakes:
htmlDecode('&prcue;') should return '≼' (i.e. '\u227C').
htmlEncode('𝌆') should return something like &#x1D306; or &#119558;. If an implementation returns two separate entities instead (e.g. &#xD834;&#xDF06; or &#55348;&#57094;), it is broken.
htmlDecode('&#x1D306;') should return '𝌆' and not '팆' (i.e. '\uD306').
htmlDecode('&#x80;') should return '€' (i.e. '\u20AC').
htmlDecode('&amp;amp;') should return '&amp;', not '&'.
For a robust solution that avoids all these issues, use a library I wrote called he for this. From its README:
he (for “HTML entities”) is a robust HTML entity encoder/decoder written in JavaScript. It supports all standardized named character references as per HTML, handles ambiguous ampersands and other edge cases just like a browser would, has an extensive test suite, and — contrary to many other JavaScript solutions — he handles astral Unicode symbols just fine. An online demo is available.
Using escape() should work with the character code range 0x00 to 0xFF (the Latin-1 range).
If you go beyond 0xFF (255), such as 0x100 (256) then escape() will not work:
// For a code unit above 0xFF, escape() emits the longer %uXXXX form instead of %XX.
escape("\u0100"); // %u0100
and:
text = "\u0100"; // Ā
// escape() yields "%u0100"; the pattern /%(..)/ then captures "u0" as if it
// were a two-digit hex pair, which produces the malformed reference below.
html = escape(text).replace(/%(..)/g,"&#x$1;"); // &#xu0;100
So, if you want to cover all Unicode characters as defined on http://www.w3.org/TR/html4/sgml/entities.html , then you could use something like:
// Turn every character in U+00A0..U+00FF into a decimal numeric reference.
var html = text.replace(/[\u00A0-\u00FF]/g, function (match) {
  var codeUnit = match.charCodeAt(0);
  return ['&#', codeUnit, ';'].join('');
});
Note here the range is between: \u00A0-\u00FF.
That's the first character code range defined in http://www.w3.org/TR/html4/sgml/entities.html which is the same as what escape() covers.
You'll need to add the other ranges you want to cover as well, or all of them.
Example: the Latin-1 range plus general punctuation and letterlike symbols (\u00A0-\u00FF and \u2022-\u2135)
// Turn every character in U+00A0..U+00FF or U+2022..U+2135 into a decimal numeric reference.
var html = text.replace(/[\u00A0-\u00FF\u2022-\u2135]/g, function (match) {
  var codeUnit = match.charCodeAt(0);
  return ['&#', codeUnit, ';'].join('');
});
Edit:
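BTW: \u00A0-\u2666 blindly converts every character from U+00A0 through U+2666 (the span that contains all HTML4 named entities) to a numeric HTML entity; characters above U+2666, including astral symbols, are left untouched: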
// Turn every character in U+00A0..U+2666 into a decimal numeric reference.
var html = text.replace(/[\u00A0-\u2666]/g, function (match) {
  var codeUnit = match.charCodeAt(0);
  return ['&#', codeUnit, ';'].join('');
});
You can use:
/**
 * Encodes a string as HTML using decimal numeric character references.
 *
 * Every character is replaced by `&#NNN;` except A-Z, a-z and the code
 * units 123..127 (`{`, `|`, `}`, `~`, DEL), which are copied through
 * unchanged. A valid surrogate pair is emitted as ONE reference to the
 * astral code point (e.g. U+1D306 -> `&#119558;`), not as two references
 * to its surrogate halves, which would be invalid HTML. An unpaired
 * surrogate is still emitted as its own code unit value.
 *
 * @param {string} str - The string to encode.
 * @returns {string} The encoded string.
 */
function encodeHTML(str) {
  var out = [];
  for (var i = 0; i < str.length; i++) {
    var code = str.charCodeAt(i);

    // High surrogate followed by a low surrogate: combine both code units
    // into the real code point and skip the second unit.
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < str.length) {
      var low = str.charCodeAt(i + 1);
      if (low >= 0xDC00 && low <= 0xDFFF) {
        code = (code - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
        i++;
        out.push('&#' + code + ';');
        continue;
      }
    }

    if (code < 65 || code > 127 || (code > 90 && code < 97)) {
      out.push('&#' + code + ';');
    } else {
      out.push(str.charAt(i));
    }
  }
  return out.join('');
}
This function HTML-encodes everything that is not a-z/A-Z (the characters {, |, }, ~ and DEL, codes 123–127, are also left as-is).
[Edit] A rather old answer. Let's add a simpler String extension to encode all extended characters:
/**
 * Returns a copy of the string in which every character in the range
 * U+0080..U+024F is replaced by its decimal numeric character reference.
 * All other characters (including <, > and &) are left unchanged.
 */
String.prototype.encodeHTML = function () {
  var toNumericEntity = function (ch) {
    var codeUnit = ch.charCodeAt();
    return '&#' + codeUnit + ';';
  };
  return this.replace(/[\u0080-\u024F]/g, toNumericEntity);
};
// usage
log('Übergroße Äpfel mit Würmern'.encodeHTML());
//=> '&#220;bergro&#223;e &#196;pfel mit W&#252;rmern'
The he library is the only 100% reliable solution that I know of!
He is written by Mathias Bynens - one of the world's most renowned JavaScript gurus - and has the following features :
// Encoding: non-ASCII symbols become hexadecimal character references; the
// astral symbol is encoded as ONE reference, not two surrogate halves.
he.encode('foo © bar ≠ baz 𝌆 qux');
// Output : 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'
// Decoding: named and numeric character references are turned back into characters.
he.decode('foo &copy; bar &ne; baz &#x1D306; qux');
// Output : 'foo © bar ≠ baz 𝌆 qux'
Having a lookup table with a bazillion replace() calls is slow and not maintainable.
Fortunately, the built-in escape() function also encodes most of the same characters, and puts them in a consistent format (%XX, where XX is the hex value of the character).
So, you can let escape() method do most of the work for you and just change its answer to be HTML entities instead of URL-escaped characters:
// Rewrite each "%XX" sequence produced by escape() as the hexadecimal reference "&#xXX;".
htmlescaped = escape(mystring).replace(/%(..)/g,"&#x$1;");
This uses the hex format for escaping values rather than the named entities, but for storing and displaying the values, it works just as well as named entities.
Of course, escape also escapes characters you don't need to escape in HTML (spaces, for instance), but you can unescape them with a few replace calls.
Edit: I like bucabay's answer better than my own... handles a larger range of characters, and requires no hacking afterward to get spaces, slashes, etc. unescaped.
Demo on JSFiddle
here's a tiny stand alone method that:
i don't know too much about unicode, but it seems to be working well.
// escape a string for display in html
// see also:
// polyfill for String.prototype.codePointAt
// https://raw.githubusercontent.com/mathiasbynens/String.prototype.codePointAt/master/codepointat.js
// how to convert characters to html entities
// http://stackoverflow.com/a/1354491/347508
// html overrides from
// https://html.spec.whatwg.org/multipage/syntax.html#table-charref-overrides / http://stackoverflow.com/questions/1354064/how-to-convert-characters-to-html-entities-using-plain-javascript/23831239#comment36668052_1354098
var _escape_overrides = { 0x00:'\uFFFD',0x80:'\u20AC',0x82:'\u201A',0x83:'\u0192',0x84:'\u201E',0x85:'\u2026',0x86:'\u2020',0x87:'\u2021',0x88:'\u02C6',0x89:'\u2030',0x8A:'\u0160',0x8B:'\u2039',0x8C:'\u0152',0x8E:'\u017D',0x91:'\u2018',0x92:'\u2019',0x93:'\u201C',0x94:'\u201D',0x95:'\u2022',0x96:'\u2013',0x97:'\u2014',0x98:'\u02DC',0x99:'\u2122',0x9A:'\u0161',0x9B:'\u203A',0x9C:'\u0153',0x9E:'\u017E',0x9F:'\u0178' };

/**
 * Escapes a string for display in HTML.
 *
 * - Code units 0x00..0xFF: if `_escape_overrides` has an entry, that
 *   replacement character is returned as-is; the characters
 *   A-Z a-z 0-9 @ * _ + - . / are kept; everything else becomes a
 *   two-digit uppercase hexadecimal reference (`&#xXX;`).
 * - Other BMP characters become a decimal reference (`&#NNN;`).
 * - A valid surrogate pair becomes ONE decimal reference to the astral
 *   code point.
 * - Unpaired surrogates are left untouched.
 *
 * @param {string} str - The string to escape.
 * @returns {string} The escaped string.
 */
function escapeHtml(str){
  // Pairs are tried first; the second alternative deliberately excludes the
  // surrogate block so that a lone surrogate is never turned into an invalid
  // reference or combined with an unrelated following character.
  return str.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[\u0000-\uD7FF\uE000-\uFFFF]/g, function(c) {
    var c1 = c.charCodeAt(0);

    // surrogate pair -> single astral code point
    if (c.length === 2) {
      return "&#" + ((c1 - 0xD800) * 0x400 + (c.charCodeAt(1) - 0xDC00) + 0x10000) + ";";
    }

    // BMP character above Latin-1
    if (c1 > 0xFF) {
      return "&#" + c1 + ";";
    }

    // 0x00..0xFF: use the override when there is one
    var override = _escape_overrides[c1];
    if (override) {
      return override;
    }

    // Same set of characters that the deprecated escape() leaves unencoded.
    if (/[A-Za-z0-9@*_+\-.\/]/.test(c)) {
      return c;
    }

    var hex = c1.toString(16).toUpperCase();
    return "&#x" + (hex.length < 2 ? "0" + hex : hex) + ";";
  });
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With