HTML encoding converts characters that are not allowed in HTML into character-entity equivalents; HTML decoding reverses the encoding. For example, when embedded in a block of text, the characters < and > are encoded as < and > for HTTP transmission.
Wikipedia has a good expalanation of character encodings and how some characters should be represented in HTML. Load the HTML data to decode from a file, then press the 'Decode' button: Browse: Alternatively, type or paste in the text you want to HTML–decode, then press the 'Decode' button.
HTML DecodeThe encoded characters are converted back to their original form in the decoding process. It decodes a string that contains HTML numeric character references and returns the decoded string. You can also choose to convert HTML code into JavaScript string.
This answer was last revised for Swift 5.2 and iOS 13.4 SDK.
There's no straightforward way to do that, but you can use NSAttributedString
magic to make this process as painless as possible (be warned that this method will strip all HTML tags as well).
Remember to initialize NSAttributedString
from main thread only. It uses WebKit to parse HTML underneath, thus the requirement.
// This is a[0]["title"] in your case
let encodedString = "The Weeknd <em>‘King Of The Fall’</em>"
guard let data = htmlEncodedString.data(using: .utf8) else {
return
}
let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
.documentType: NSAttributedString.DocumentType.html,
.characterEncoding: String.Encoding.utf8.rawValue
]
guard let attributedString = try? NSAttributedString(data: data, options: options, documentAttributes: nil) else {
return
}
// The Weeknd ‘King Of The Fall’
let decodedString = attributedString.string
extension String {
init?(htmlEncodedString: String) {
guard let data = htmlEncodedString.data(using: .utf8) else {
return nil
}
let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
.documentType: NSAttributedString.DocumentType.html,
.characterEncoding: String.Encoding.utf8.rawValue
]
guard let attributedString = try? NSAttributedString(data: data, options: options, documentAttributes: nil) else {
return nil
}
self.init(attributedString.string)
}
}
let encodedString = "The Weeknd <em>‘King Of The Fall’</em>"
let decodedString = String(htmlEncodedString: encodedString)
@akashivskyy's answer is great and demonstrates how to utilize NSAttributedString
to decode HTML entities. One possible disadvantage
(as he stated) is that all HTML markup is removed as well, so
<strong> 4 < 5 & 3 > 2</strong>
becomes
4 < 5 & 3 > 2
On OS X there is CFXMLCreateStringByUnescapingEntities()
which does the job:
let encoded = "<strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @ "
let decoded = CFXMLCreateStringByUnescapingEntities(nil, encoded, nil) as String
println(decoded)
// <strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @
but this is not available on iOS.
Here is a pure Swift implementation. It decodes character entities
references like <
using a dictionary, and all numeric character
entities like @
or €
. (Note that I did not list all
252 HTML entities explicitly.)
Swift 4:
// Mapping from XML/HTML character entity reference to character
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [ Substring : Character ] = [
// XML predefined entities:
""" : "\"",
"&" : "&",
"'" : "'",
"<" : "<",
">" : ">",
// HTML character entity references:
" " : "\u{00a0}",
// ...
"♦" : "♦",
]
extension String {
/// Returns a new string made by replacing in the `String`
/// all HTML character entity references with the corresponding
/// character.
var stringByDecodingHTMLEntities : String {
// ===== Utility functions =====
// Convert the number in the string to the corresponding
// Unicode character, e.g.
// decodeNumeric("64", 10) --> "@"
// decodeNumeric("20ac", 16) --> "€"
func decodeNumeric(_ string : Substring, base : Int) -> Character? {
guard let code = UInt32(string, radix: base),
let uniScalar = UnicodeScalar(code) else { return nil }
return Character(uniScalar)
}
// Decode the HTML character entity to the corresponding
// Unicode character, return `nil` for invalid input.
// decode("@") --> "@"
// decode("€") --> "€"
// decode("<") --> "<"
// decode("&foo;") --> nil
func decode(_ entity : Substring) -> Character? {
if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
} else if entity.hasPrefix("&#") {
return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
} else {
return characterEntities[entity]
}
}
// ===== Method starts here =====
var result = ""
var position = startIndex
// Find the next '&' and copy the characters preceding it to `result`:
while let ampRange = self[position...].range(of: "&") {
result.append(contentsOf: self[position ..< ampRange.lowerBound])
position = ampRange.lowerBound
// Find the next ';' and copy everything from '&' to ';' into `entity`
guard let semiRange = self[position...].range(of: ";") else {
// No matching ';'.
break
}
let entity = self[position ..< semiRange.upperBound]
position = semiRange.upperBound
if let decoded = decode(entity) {
// Replace by decoded character:
result.append(decoded)
} else {
// Invalid entity, copy verbatim:
result.append(contentsOf: entity)
}
}
// Copy remaining characters to `result`:
result.append(contentsOf: self[position...])
return result
}
}
Example:
let encoded = "<strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @ "
let decoded = encoded.stringByDecodingHTMLEntities
print(decoded)
// <strong> 4 < 5 & 3 > 2 .</strong> Price: 12 €. @
Swift 3:
// Mapping from XML/HTML character entity reference to character
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [ String : Character ] = [
// XML predefined entities:
""" : "\"",
"&" : "&",
"'" : "'",
"<" : "<",
">" : ">",
// HTML character entity references:
" " : "\u{00a0}",
// ...
"♦" : "♦",
]
extension String {
/// Returns a new string made by replacing in the `String`
/// all HTML character entity references with the corresponding
/// character.
var stringByDecodingHTMLEntities : String {
// ===== Utility functions =====
// Convert the number in the string to the corresponding
// Unicode character, e.g.
// decodeNumeric("64", 10) --> "@"
// decodeNumeric("20ac", 16) --> "€"
func decodeNumeric(_ string : String, base : Int) -> Character? {
guard let code = UInt32(string, radix: base),
let uniScalar = UnicodeScalar(code) else { return nil }
return Character(uniScalar)
}
// Decode the HTML character entity to the corresponding
// Unicode character, return `nil` for invalid input.
// decode("@") --> "@"
// decode("€") --> "€"
// decode("<") --> "<"
// decode("&foo;") --> nil
func decode(_ entity : String) -> Character? {
if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 3) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 16)
} else if entity.hasPrefix("&#") {
return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 2) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 10)
} else {
return characterEntities[entity]
}
}
// ===== Method starts here =====
var result = ""
var position = startIndex
// Find the next '&' and copy the characters preceding it to `result`:
while let ampRange = self.range(of: "&", range: position ..< endIndex) {
result.append(self[position ..< ampRange.lowerBound])
position = ampRange.lowerBound
// Find the next ';' and copy everything from '&' to ';' into `entity`
if let semiRange = self.range(of: ";", range: position ..< endIndex) {
let entity = self[position ..< semiRange.upperBound]
position = semiRange.upperBound
if let decoded = decode(entity) {
// Replace by decoded character:
result.append(decoded)
} else {
// Invalid entity, copy verbatim:
result.append(entity)
}
} else {
// No matching ';'.
break
}
}
// Copy remaining characters to `result`:
result.append(self[position ..< endIndex])
return result
}
}
Swift 2:
// Mapping from XML/HTML character entity reference to character
// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [ String : Character ] = [
// XML predefined entities:
""" : "\"",
"&" : "&",
"'" : "'",
"<" : "<",
">" : ">",
// HTML character entity references:
" " : "\u{00a0}",
// ...
"♦" : "♦",
]
extension String {
/// Returns a new string made by replacing in the `String`
/// all HTML character entity references with the corresponding
/// character.
var stringByDecodingHTMLEntities : String {
// ===== Utility functions =====
// Convert the number in the string to the corresponding
// Unicode character, e.g.
// decodeNumeric("64", 10) --> "@"
// decodeNumeric("20ac", 16) --> "€"
func decodeNumeric(string : String, base : Int32) -> Character? {
let code = UInt32(strtoul(string, nil, base))
return Character(UnicodeScalar(code))
}
// Decode the HTML character entity to the corresponding
// Unicode character, return `nil` for invalid input.
// decode("@") --> "@"
// decode("€") --> "€"
// decode("<") --> "<"
// decode("&foo;") --> nil
func decode(entity : String) -> Character? {
if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(3)), base: 16)
} else if entity.hasPrefix("&#") {
return decodeNumeric(entity.substringFromIndex(entity.startIndex.advancedBy(2)), base: 10)
} else {
return characterEntities[entity]
}
}
// ===== Method starts here =====
var result = ""
var position = startIndex
// Find the next '&' and copy the characters preceding it to `result`:
while let ampRange = self.rangeOfString("&", range: position ..< endIndex) {
result.appendContentsOf(self[position ..< ampRange.startIndex])
position = ampRange.startIndex
// Find the next ';' and copy everything from '&' to ';' into `entity`
if let semiRange = self.rangeOfString(";", range: position ..< endIndex) {
let entity = self[position ..< semiRange.endIndex]
position = semiRange.endIndex
if let decoded = decode(entity) {
// Replace by decoded character:
result.append(decoded)
} else {
// Invalid entity, copy verbatim:
result.appendContentsOf(entity)
}
} else {
// No matching ';'.
break
}
}
// Copy remaining characters to `result`:
result.appendContentsOf(self[position ..< endIndex])
return result
}
}
Swift 4
extension String {
var htmlDecoded: String {
let decoded = try? NSAttributedString(data: Data(utf8), options: [
.documentType: NSAttributedString.DocumentType.html,
.characterEncoding: String.Encoding.utf8.rawValue
], documentAttributes: nil).string
return decoded ?? self
}
}
Swift 3 version of @akashivskyy's extension,
extension String {
init(htmlEncodedString: String) {
self.init()
guard let encodedData = htmlEncodedString.data(using: .utf8) else {
self = htmlEncodedString
return
}
let attributedOptions: [String : Any] = [
NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
NSCharacterEncodingDocumentAttribute: String.Encoding.utf8.rawValue
]
do {
let attributedString = try NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil)
self = attributedString.string
} catch {
print("Error: \(error)")
self = htmlEncodedString
}
}
}
Swift 2 version of @akashivskyy's extension,
extension String {
init(htmlEncodedString: String) {
if let encodedData = htmlEncodedString.dataUsingEncoding(NSUTF8StringEncoding){
let attributedOptions : [String: AnyObject] = [
NSDocumentTypeDocumentAttribute: NSHTMLTextDocumentType,
NSCharacterEncodingDocumentAttribute: NSUTF8StringEncoding
]
do{
if let attributedString:NSAttributedString = try NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil){
self.init(attributedString.string)
}else{
print("error")
self.init(htmlEncodedString) //Returning actual string if there is an error
}
}catch{
print("error: \(error)")
self.init(htmlEncodedString) //Returning actual string if there is an error
}
}else{
self.init(htmlEncodedString) //Returning actual string if there is an error
}
}
}
extension String {
init(htmlEncodedString: String) {
self.init()
guard let encodedData = htmlEncodedString.data(using: .utf8) else {
self = htmlEncodedString
return
}
let attributedOptions: [NSAttributedString.DocumentReadingOptionKey : Any] = [
.documentType: NSAttributedString.DocumentType.html,
.characterEncoding: String.Encoding.utf8.rawValue
]
do {
let attributedString = try NSAttributedString(data: encodedData, options: attributedOptions, documentAttributes: nil)
self = attributedString.string
}
catch {
print("Error: \(error)")
self = htmlEncodedString
}
}
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With