i was wondering if there is a MYSQL function to decode text with html entities. I have seen some approaches using REPLACE but it looks kinda hard to manage all the entities.
The MySQL DECODE() function returns empty strings if the encoded string is an empty string. The DECODE() function accepts two parameters which are the encoded string to be decoded and the password string to decode the encoded string. Syntax: DECODE(encoded_string, password_string);
Wikipedia has a good expalanation of character encodings and how some characters should be represented in HTML. Load the HTML data to decode from a file, then press the 'Decode' button: Browse: Alternatively, type or paste in the text you want to HTML–decode, then press the 'Decode' button.
HTML encoding converts characters that are not allowed in HTML into character-entity equivalents; HTML decoding reverses the encoding. For example, when embedded in a block of text, the characters < and > are encoded as < and > for HTTP transmission.
The following function can be used to decode numeric entities like 〹
CREATE FUNCTION entity_decode(txt TEXT CHARSET utf8) RETURNS TEXT CHARSET utf8
NO SQL
DETERMINISTIC
BEGIN
DECLARE tmp TEXT CHARSET utf8 DEFAULT txt;
DECLARE entity TEXT CHARSET utf8;
DECLARE pos1 INT DEFAULT 1;
DECLARE pos2 INT;
DECLARE codepoint INT;
IF txt IS NULL THEN
RETURN NULL;
END IF;
LOOP
SET pos1 = LOCATE('&#', tmp, pos1);
IF pos1 = 0 THEN
RETURN tmp;
END IF;
SET pos2 = LOCATE(';', tmp, pos1 + 2);
IF pos2 > pos1 THEN
SET entity = SUBSTRING(tmp, pos1, pos2 - pos1 + 1);
IF entity REGEXP '^&#[[:digit:]]+;$' THEN
SET codepoint = CAST(SUBSTRING(entity, 3, pos2 - pos1 - 2) AS UNSIGNED);
IF codepoint > 31 THEN
SET tmp = CONCAT(LEFT(tmp, pos1 - 1), CHAR(codepoint USING utf32), SUBSTRING(tmp, pos2 + 1));
END IF;
END IF;
END IF;
SET pos1 = pos1 + 1;
END LOOP;
END
You can create function like below
DELIMITER $$
DROP FUNCTION IF EXISTS `HTML_UnEncode`$$
CREATE FUNCTION `HTML_UnEncode`(X VARCHAR(255)) RETURNS VARCHAR(255) CHARSET latin1 DETERMINISTIC
BEGIN
DECLARE TextString VARCHAR(255) ;
SET TextString = X ;
#quotation mark
IF INSTR( X , '"' )
THEN SET TextString = REPLACE(TextString, '"','"') ;
END IF ;
#apostrophe
IF INSTR( X , ''' )
THEN SET TextString = REPLACE(TextString, ''','"') ;
END IF ;
#ampersand
IF INSTR( X , '&' )
THEN SET TextString = REPLACE(TextString, '&','&') ;
END IF ;
#less-than
IF INSTR( X , '<' )
THEN SET TextString = REPLACE(TextString, '<','<') ;
END IF ;
#greater-than
IF INSTR( X , '>' )
THEN SET TextString = REPLACE(TextString, '>','>') ;
END IF ;
#non-breaking space
IF INSTR( X , ' ' )
THEN SET TextString = REPLACE(TextString, ' ',' ') ;
END IF ;
#inverted exclamation mark
IF INSTR( X , '¡' )
THEN SET TextString = REPLACE(TextString, '¡','¡') ;
END IF ;
#cent
IF INSTR( X , '¢' )
THEN SET TextString = REPLACE(TextString, '¢','¢') ;
END IF ;
#pound
IF INSTR( X , '£' )
THEN SET TextString = REPLACE(TextString, '£','£') ;
END IF ;
#currency
IF INSTR( X , '¤' )
THEN SET TextString = REPLACE(TextString, '¤','¤') ;
END IF ;
#yen
IF INSTR( X , '¥' )
THEN SET TextString = REPLACE(TextString, '¥','¥') ;
END IF ;
#broken vertical bar
IF INSTR( X , '¦' )
THEN SET TextString = REPLACE(TextString, '¦','¦') ;
END IF ;
#section
IF INSTR( X , '§' )
THEN SET TextString = REPLACE(TextString, '§','§') ;
END IF ;
#spacing diaeresis
IF INSTR( X , '¨' )
THEN SET TextString = REPLACE(TextString, '¨','¨') ;
END IF ;
#copyright
IF INSTR( X , '©' )
THEN SET TextString = REPLACE(TextString, '©','©') ;
END IF ;
#feminine ordinal indicator
IF INSTR( X , 'ª' )
THEN SET TextString = REPLACE(TextString, 'ª','ª') ;
END IF ;
#angle quotation mark (left)
IF INSTR( X , '«' )
THEN SET TextString = REPLACE(TextString, '«','«') ;
END IF ;
#negation
IF INSTR( X , '¬' )
THEN SET TextString = REPLACE(TextString, '¬','¬') ;
END IF ;
#soft hyphen
IF INSTR( X , '­' )
THEN SET TextString = REPLACE(TextString, '­','') ;
END IF ;
#registered trademark
IF INSTR( X , '®' )
THEN SET TextString = REPLACE(TextString, '®','®') ;
END IF ;
#spacing macron
IF INSTR( X , '¯' )
THEN SET TextString = REPLACE(TextString, '¯','¯') ;
END IF ;
#degree
IF INSTR( X , '°' )
THEN SET TextString = REPLACE(TextString, '°','°') ;
END IF ;
#plus-or-minus
IF INSTR( X , '±' )
THEN SET TextString = REPLACE(TextString, '±','±') ;
END IF ;
#superscript 2
IF INSTR( X , '²' )
THEN SET TextString = REPLACE(TextString, '²','²') ;
END IF ;
#superscript 3
IF INSTR( X , '³' )
THEN SET TextString = REPLACE(TextString, '³','³') ;
END IF ;
#spacing acute
IF INSTR( X , '´' )
THEN SET TextString = REPLACE(TextString, '´','´') ;
END IF ;
#micro
IF INSTR( X , 'µ' )
THEN SET TextString = REPLACE(TextString, 'µ','µ') ;
END IF ;
#paragraph
IF INSTR( X , '¶' )
THEN SET TextString = REPLACE(TextString, '¶','¶') ;
END IF ;
#middle dot
IF INSTR( X , '·' )
THEN SET TextString = REPLACE(TextString, '·','·') ;
END IF ;
#spacing cedilla
IF INSTR( X , '¸' )
THEN SET TextString = REPLACE(TextString, '¸','¸') ;
END IF ;
#superscript 1
IF INSTR( X , '¹' )
THEN SET TextString = REPLACE(TextString, '¹','¹') ;
END IF ;
#masculine ordinal indicator
IF INSTR( X , 'º' )
THEN SET TextString = REPLACE(TextString, 'º','º') ;
END IF ;
#angle quotation mark (right)
IF INSTR( X , '»' )
THEN SET TextString = REPLACE(TextString, '»','»') ;
END IF ;
#fraction 1/4
IF INSTR( X , '¼' )
THEN SET TextString = REPLACE(TextString, '¼','¼') ;
END IF ;
#fraction 1/2
IF INSTR( X , '½' )
THEN SET TextString = REPLACE(TextString, '½','½') ;
END IF ;
#fraction 3/4
IF INSTR( X , '¾' )
THEN SET TextString = REPLACE(TextString, '¾','¾') ;
END IF ;
#inverted question mark
IF INSTR( X , '¿' )
THEN SET TextString = REPLACE(TextString, '¿','¿') ;
END IF ;
#multiplication
IF INSTR( X , '×' )
THEN SET TextString = REPLACE(TextString, '×','×') ;
END IF ;
#division
IF INSTR( X , '÷' )
THEN SET TextString = REPLACE(TextString, '÷','÷') ;
END IF ;
#capital a, grave accent
IF INSTR( X , 'À' )
THEN SET TextString = REPLACE(TextString, 'À','À') ;
END IF ;
#capital a, acute accent
IF INSTR( X , 'Á' )
THEN SET TextString = REPLACE(TextString, 'Á','Á') ;
END IF ;
#capital a, circumflex accent
IF INSTR( X , 'Â' )
THEN SET TextString = REPLACE(TextString, 'Â','Â') ;
END IF ;
#capital a, tilde
IF INSTR( X , 'Ã' )
THEN SET TextString = REPLACE(TextString, 'Ã','Ã') ;
END IF ;
#capital a, umlaut mark
IF INSTR( X , 'Ä' )
THEN SET TextString = REPLACE(TextString, 'Ä','Ä') ;
END IF ;
#capital a, ring
IF INSTR( X , 'Å' )
THEN SET TextString = REPLACE(TextString, 'Å','Å') ;
END IF ;
#capital ae
IF INSTR( X , 'Æ' )
THEN SET TextString = REPLACE(TextString, 'Æ','Æ') ;
END IF ;
#capital c, cedilla
IF INSTR( X , 'Ç' )
THEN SET TextString = REPLACE(TextString, 'Ç','Ç') ;
END IF ;
#capital e, grave accent
IF INSTR( X , 'È' )
THEN SET TextString = REPLACE(TextString, 'È','È') ;
END IF ;
#capital e, acute accent
IF INSTR( X , 'É' )
THEN SET TextString = REPLACE(TextString, 'É','É') ;
END IF ;
#capital e, circumflex accent
IF INSTR( X , 'Ê' )
THEN SET TextString = REPLACE(TextString, 'Ê','Ê') ;
END IF ;
#capital e, umlaut mark
IF INSTR( X , 'Ë' )
THEN SET TextString = REPLACE(TextString, 'Ë','Ë') ;
END IF ;
#capital i, grave accent
IF INSTR( X , 'Ì' )
THEN SET TextString = REPLACE(TextString, 'Ì','Ì') ;
END IF ;
#capital i, acute accent
IF INSTR( X , 'Í' )
THEN SET TextString = REPLACE(TextString, 'Í','Í') ;
END IF ;
#capital i, circumflex accent
IF INSTR( X , 'Î' )
THEN SET TextString = REPLACE(TextString, 'Î','Î') ;
END IF ;
#capital i, umlaut mark
IF INSTR( X , 'Ï' )
THEN SET TextString = REPLACE(TextString, 'Ï','Ï') ;
END IF ;
#capital eth, Icelandic
IF INSTR( X , 'Ð' )
THEN SET TextString = REPLACE(TextString, 'Ð','Ð') ;
END IF ;
#capital n, tilde
IF INSTR( X , 'Ñ' )
THEN SET TextString = REPLACE(TextString, 'Ñ','Ñ') ;
END IF ;
#capital o, grave accent
IF INSTR( X , 'Ò' )
THEN SET TextString = REPLACE(TextString, 'Ò','Ò') ;
END IF ;
#capital o, acute accent
IF INSTR( X , 'Ó' )
THEN SET TextString = REPLACE(TextString, 'Ó','Ó') ;
END IF ;
#capital o, circumflex accent
IF INSTR( X , 'Ô' )
THEN SET TextString = REPLACE(TextString, 'Ô','Ô') ;
END IF ;
#capital o, tilde
IF INSTR( X , 'Õ' )
THEN SET TextString = REPLACE(TextString, 'Õ','Õ') ;
END IF ;
#capital o, umlaut mark
IF INSTR( X , 'Ö' )
THEN SET TextString = REPLACE(TextString, 'Ö','Ö') ;
END IF ;
#capital o, slash
IF INSTR( X , 'Ø' )
THEN SET TextString = REPLACE(TextString, 'Ø','Ø') ;
END IF ;
#capital u, grave accent
IF INSTR( X , 'Ù' )
THEN SET TextString = REPLACE(TextString, 'Ù','Ù') ;
END IF ;
#capital u, acute accent
IF INSTR( X , 'Ú' )
THEN SET TextString = REPLACE(TextString, 'Ú','Ú') ;
END IF ;
#capital u, circumflex accent
IF INSTR( X , 'Û' )
THEN SET TextString = REPLACE(TextString, 'Û','Û') ;
END IF ;
#capital u, umlaut mark
IF INSTR( X , 'Ü' )
THEN SET TextString = REPLACE(TextString, 'Ü','Ü') ;
END IF ;
#capital y, acute accent
IF INSTR( X , 'Ý' )
THEN SET TextString = REPLACE(TextString, 'Ý','Ý') ;
END IF ;
#capital THORN, Icelandic
IF INSTR( X , 'Þ' )
THEN SET TextString = REPLACE(TextString, 'Þ','Þ') ;
END IF ;
#small sharp s, German
IF INSTR( X , 'ß' )
THEN SET TextString = REPLACE(TextString, 'ß','ß') ;
END IF ;
#small a, grave accent
IF INSTR( X , 'à' )
THEN SET TextString = REPLACE(TextString, 'à','à') ;
END IF ;
#small a, acute accent
IF INSTR( X , 'á' )
THEN SET TextString = REPLACE(TextString, 'á','á') ;
END IF ;
#small a, circumflex accent
IF INSTR( X , 'â' )
THEN SET TextString = REPLACE(TextString, 'â','â') ;
END IF ;
#small a, tilde
IF INSTR( X , 'ã' )
THEN SET TextString = REPLACE(TextString, 'ã','ã') ;
END IF ;
#small a, umlaut mark
IF INSTR( X , 'ä' )
THEN SET TextString = REPLACE(TextString, 'ä','ä') ;
END IF ;
#small a, ring
IF INSTR( X , 'å' )
THEN SET TextString = REPLACE(TextString, 'å','å') ;
END IF ;
#small ae
IF INSTR( X , 'æ' )
THEN SET TextString = REPLACE(TextString, 'æ','æ') ;
END IF ;
#small c, cedilla
IF INSTR( X , 'ç' )
THEN SET TextString = REPLACE(TextString, 'ç','ç') ;
END IF ;
#small e, grave accent
IF INSTR( X , 'è' )
THEN SET TextString = REPLACE(TextString, 'è','è') ;
END IF ;
#small e, acute accent
IF INSTR( X , 'é' )
THEN SET TextString = REPLACE(TextString, 'é','é') ;
END IF ;
#small e, circumflex accent
IF INSTR( X , 'ê' )
THEN SET TextString = REPLACE(TextString, 'ê','ê') ;
END IF ;
#small e, umlaut mark
IF INSTR( X , 'ë' )
THEN SET TextString = REPLACE(TextString, 'ë','ë') ;
END IF ;
#small i, grave accent
IF INSTR( X , 'ì' )
THEN SET TextString = REPLACE(TextString, 'ì','ì') ;
END IF ;
#small i, acute accent
IF INSTR( X , 'í' )
THEN SET TextString = REPLACE(TextString, 'í','í') ;
END IF ;
#small i, circumflex accent
IF INSTR( X , 'î' )
THEN SET TextString = REPLACE(TextString, 'î','î') ;
END IF ;
#small i, umlaut mark
IF INSTR( X , 'ï' )
THEN SET TextString = REPLACE(TextString, 'ï','ï') ;
END IF ;
#small eth, Icelandic
IF INSTR( X , 'ð' )
THEN SET TextString = REPLACE(TextString, 'ð','ð') ;
END IF ;
#small n, tilde
IF INSTR( X , 'ñ' )
THEN SET TextString = REPLACE(TextString, 'ñ','ñ') ;
END IF ;
#small o, grave accent
IF INSTR( X , 'ò' )
THEN SET TextString = REPLACE(TextString, 'ò','ò') ;
END IF ;
#small o, acute accent
IF INSTR( X , 'ó' )
THEN SET TextString = REPLACE(TextString, 'ó','ó') ;
END IF ;
#small o, circumflex accent
IF INSTR( X , 'ô' )
THEN SET TextString = REPLACE(TextString, 'ô','ô') ;
END IF ;
#small o, tilde
IF INSTR( X , 'õ' )
THEN SET TextString = REPLACE(TextString, 'õ','õ') ;
END IF ;
#small o, umlaut mark
IF INSTR( X , 'ö' )
THEN SET TextString = REPLACE(TextString, 'ö','ö') ;
END IF ;
#small o, slash
IF INSTR( X , 'ø' )
THEN SET TextString = REPLACE(TextString, 'ø','ø') ;
END IF ;
#small u, grave accent
IF INSTR( X , 'ù' )
THEN SET TextString = REPLACE(TextString, 'ù','ù') ;
END IF ;
#small u, acute accent
IF INSTR( X , 'ú' )
THEN SET TextString = REPLACE(TextString, 'ú','ú') ;
END IF ;
#small u, circumflex accent
IF INSTR( X , 'û' )
THEN SET TextString = REPLACE(TextString, 'û','û') ;
END IF ;
#small u, umlaut mark
IF INSTR( X , 'ü' )
THEN SET TextString = REPLACE(TextString, 'ü','ü') ;
END IF ;
#small y, acute accent
IF INSTR( X , 'ý' )
THEN SET TextString = REPLACE(TextString, 'ý','ý') ;
END IF ;
#small thorn, Icelandic
IF INSTR( X , 'þ' )
THEN SET TextString = REPLACE(TextString, 'þ','þ') ;
END IF ;
#small y, umlaut mark
IF INSTR( X , 'ÿ' )
THEN SET TextString = REPLACE(TextString, 'ÿ','ÿ') ;
END IF ;
RETURN TextString ;
END$$
DELIMITER ;
And execute below query :
SELECT HTML_UnEncode('this is a <a>test, nothing more');
This adds support for hex encoded numeric values (which I get from Perl using "use HTML::Entities qw(encode_entities_numeric);".
DELIMITER $$
CREATE FUNCTION entity_decode(txt TEXT CHARSET utf8) RETURNS TEXT CHARSET utf8
NO SQL
DETERMINISTIC
BEGIN
DECLARE tmp TEXT CHARSET utf8 DEFAULT txt;
DECLARE entity TEXT CHARSET utf8;
DECLARE pos1 INT DEFAULT 1;
DECLARE pos2 INT;
DECLARE codepoint INT;
IF txt IS NULL THEN
RETURN NULL;
END IF;
LOOP
SET pos1 = LOCATE('&#', tmp, pos1);
IF pos1 = 0 THEN
RETURN tmp;
END IF;
SET pos2 = LOCATE(';', tmp, pos1 + 2);
IF pos2 > pos1 THEN
SET entity = SUBSTRING(tmp, pos1, pos2 - pos1 + 1);
IF entity REGEXP '^&#[[:digit:]]+;$' THEN
SET codepoint = CAST(SUBSTRING(entity, 3, pos2 - pos1 - 2) AS UNSIGNED);
IF codepoint > 31 THEN
SET tmp = CONCAT(LEFT(tmp, pos1 - 1), CHAR(codepoint USING utf32), SUBSTRING(tmp, pos2 + 1));
END IF;
END IF;
IF entity REGEXP '^&#x[[:digit:]]+;$' THEN
SET codepoint = CAST(CONV(SUBSTRING(entity, 4, pos2 - pos1 - 3), 16, 10) AS UNSIGNED);
IF codepoint > 31 THEN
SET tmp = CONCAT(LEFT(tmp, pos1 - 1), CHAR(codepoint USING utf32), SUBSTRING(tmp, pos2 + 1));
END IF;
END IF;
END IF;
SET pos1 = pos1 + 1;
END LOOP;
END$$
DELIMITER ;
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With