I'm grabbing some tweets and printing them out on my site, and curly apostrophes are being rendered as "â€™". This is not good. What PHP function should I run the string through to get these weird characters to display as something closer to '?
I battled with this for almost a day and then found that this function works reliably. It handles both UTF-8 sequences and stray single-byte (Windows-1252) characters, and converts characters that are beyond the basic ASCII set into their HTML entities. It's good for cleaning up MS Word rubbish.
/**
 * Convert every non-ASCII character in a string into an HTML entity.
 *
 * The input may mix well-formed UTF-8 with stray single bytes left over from
 * Windows-1252 / Latin-1 sources (MS Word "smart quotes" and similar):
 *
 *  - A well-formed UTF-8 sequence is treated as one character.
 *  - Any other byte >= 0x80 is interpreted as a Windows-1252 character.
 *  - Characters with an HTML 4.01 named entity become that entity
 *    (e.g. U+2019 => "&rsquo;"); everything else becomes a decimal numeric
 *    reference (e.g. U+017D => "&#381;").
 *  - ASCII bytes, including "&", "<", ">" and quotes, are left untouched, so
 *    existing markup in the text is preserved. This function does NOT make
 *    untrusted text safe for HTML output.
 *  - The five bytes that Windows-1252 leaves undefined
 *    (0x81, 0x8D, 0x8F, 0x90, 0x9D) are passed through unchanged.
 *
 * Each UTF-8 sequence is consumed as a unit, so valid multi-byte characters
 * are never split and re-encoded byte by byte (which would turn "é" into
 * "Ã©").
 *
 * @param string $text Raw text, UTF-8 and/or Windows-1252 encoded.
 *
 * @return string Pure-ASCII text in which every non-ASCII character is an
 *                HTML entity.
 */
function filterText($text)
{
    $text = (string) $text;
    if ($text === '') {
        return '';
    }

    // Windows-1252 bytes 0x80-0x9F => Unicode code points. Bytes 0xA0-0xFF
    // are identical to Latin-1, where the byte value IS the code point.
    $cp1252 = array(
        0x80 => 0x20AC, 0x82 => 0x201A, 0x83 => 0x0192, 0x84 => 0x201E,
        0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02C6,
        0x89 => 0x2030, 0x8A => 0x0160, 0x8B => 0x2039, 0x8C => 0x0152,
        0x8E => 0x017D, 0x91 => 0x2018, 0x92 => 0x2019, 0x93 => 0x201C,
        0x94 => 0x201D, 0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014,
        0x98 => 0x02DC, 0x99 => 0x2122, 0x9A => 0x0161, 0x9B => 0x203A,
        0x9C => 0x0153, 0x9E => 0x017E, 0x9F => 0x0178,
    );

    // Byte-mode pattern (no /u flag on purpose, the input may be invalid
    // UTF-8): try each well-formed UTF-8 multi-byte form first, excluding
    // overlong encodings and surrogates, then fall back to any single
    // high byte.
    $pattern = '/[\xC2-\xDF][\x80-\xBF]'
        . '|\xE0[\xA0-\xBF][\x80-\xBF]'
        . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'
        . '|\xED[\x80-\x9F][\x80-\xBF]'
        . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'
        . '|[\xF1-\xF3][\x80-\xBF]{3}'
        . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'
        . '|[\x80-\xFF]/';

    $toEntity = function ($match) use ($cp1252) {
        $sequence = $match[0];
        $length = strlen($sequence);

        if ($length === 1) {
            // Stray byte: not part of any valid UTF-8 sequence.
            $byte = ord($sequence);
            if ($byte >= 0xA0) {
                $codePoint = $byte;
            } elseif (isset($cp1252[$byte])) {
                $codePoint = $cp1252[$byte];
            } else {
                // Undefined in Windows-1252: nothing sensible to emit.
                return $sequence;
            }

            // Re-encode as UTF-8 so htmlentities() can look up a named
            // entity. Every code point reachable here is below U+10000.
            if ($codePoint < 0x800) {
                $char = chr(0xC0 | ($codePoint >> 6))
                    . chr(0x80 | ($codePoint & 0x3F));
            } else {
                $char = chr(0xE0 | ($codePoint >> 12))
                    . chr(0x80 | (($codePoint >> 6) & 0x3F))
                    . chr(0x80 | ($codePoint & 0x3F));
            }
        } else {
            // Well-formed UTF-8: decode the code point for the numeric
            // fallback. The lead byte carries 5, 4 or 3 payload bits for a
            // 2-, 3- or 4-byte sequence respectively.
            $char = $sequence;
            $codePoint = ord($sequence[0]) & (0xFF >> ($length + 1));
            for ($i = 1; $i < $length; $i++) {
                $codePoint = ($codePoint << 6) | (ord($sequence[$i]) & 0x3F);
            }
        }

        // htmlentities() returns the character unchanged when HTML 4.01 has
        // no named entity for it; use a numeric reference in that case.
        $named = htmlentities($char, ENT_NOQUOTES, 'UTF-8');
        if ($named !== '' && $named !== $char) {
            return $named;
        }

        return '&#' . $codePoint . ';';
    };

    $filtered = preg_replace_callback($pattern, $toEntity, $text);

    // preg_replace_callback() yields null on an internal PCRE error; fall
    // back to the unmodified input rather than returning nothing.
    return $filtered === null ? $text : $filtered;
}
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate to Us With