There are tons of entries and answers online about this, but they all go in the opposite direction of what I need. From my iTunes XML, I have thousands of percent-encoded entries, in multiple languages, that I'm trying to convert to Unicode text with an XSLT stylesheet. Is there any function or process that I'm missing, other than tracking down every single character and doing a replace? Here is a small sample showing the variety that I'm working with: in each pair, the first line is the XML string value and the following line is the plain text that I'm trying to generate and output to a text file.
<string>/iTunes/iTunes%20Music/Droit%20devant/L'odysse%CC%81e.mp3</string>
/iTunes/iTunes Music/Droit devant/L'odyssée.mp3
<string>A%CC%80%20la%20Pe%CC%82che</string>
À la Pêche
<string>%D0%97%D0%B0%D0%BF%D0%BE%D0%BC%D0%B8%D0%BD%D0%B0%D0%B8%CC%86</string>
Запоминай
<string>%CE%9A%CE%BF%CC%81%CF%84%CF%83%CC%8C%CE%B1%CF%81%CE%B9</string>
Κότσ̌αρι
This last one may not display properly for some, because of the overstriking hacek/caron.
Thanks in advance for any advice or leads
A pure XSLT 2.0 solution could make use of the string-to-codepoints() and the codepoints-to-string() functions. The UTF-8 decoding is a bit messy, but it can be done.
This XSLT 2.0 style-sheet...
<xsl:stylesheet version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:so="http://stackoverflow.com/questions/13768754"
    exclude-result-prefixes="xsl xs so">

  <!--
    Copies the input document unchanged, except that percent-encoded
    UTF-8 byte runs (e.g. "%CC%81") found in the text of <string>
    elements are replaced by the characters they encode.
  -->

  <xsl:output encoding="UTF-8" omit-xml-declaration="yes" indent="yes" />
  <xsl:strip-space elements="*"/>

  <!-- Code points of '0' and 'A': the two bases used to turn a hex digit
       character into its numeric value (see the string/text() template). -->
  <xsl:variable name="cp-base" select="string-to-codepoints('0A')" as="xs:integer+" />

  <!-- Identity transform: everything not matched by a more specific
       template is copied through as-is. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()" />
    </xsl:copy>
  </xsl:template>

  <!--
    so:utf8decode
    Decodes a sequence of UTF-8 byte values into Unicode code points.

    Param  bytes  Byte values (0 to 255), in order.
    Returns       The decoded code points, in order; empty for empty input.

    Bytes that cannot start a character are skipped rather than decoded:
      * C0 control bytes other than TAB, LF and CR (they are not XML 1.0
        characters, so codepoints-to-string() would reject them);
      * stray continuation bytes 0x80 to 0xBF;
      * bytes 0xF8 and above, which are never valid in UTF-8.
    Continuation bytes are not range-checked. If a multi-byte sequence is
    cut short, the missing operand is the empty sequence, the arithmetic
    yields the empty sequence, and the incomplete character is dropped.
  -->
  <xsl:function name="so:utf8decode" as="xs:integer*">
    <xsl:param name="bytes" as="xs:integer*" />
    <xsl:choose>
      <xsl:when test="empty($bytes)" />
      <!-- NUL and the other C0 controls that XML 1.0 forbids: drop. -->
      <xsl:when test="$bytes[1] lt 32 and not($bytes[1] = (9, 10, 13))">
        <xsl:sequence select="so:utf8decode( subsequence( $bytes, 2))" />
      </xsl:when>
      <!-- One byte: 0xxxxxxx (ASCII). -->
      <xsl:when test="$bytes[1] le 127">
        <xsl:sequence select="$bytes[1], so:utf8decode( subsequence( $bytes, 2))" />
      </xsl:when>
      <!-- 10xxxxxx in lead position is a stray continuation byte: drop. -->
      <xsl:when test="$bytes[1] lt 192">
        <xsl:sequence select="so:utf8decode( subsequence( $bytes, 2))" />
      </xsl:when>
      <!-- Two bytes: 110xxxxx 10xxxxxx. -->
      <xsl:when test="$bytes[1] lt 224">
        <xsl:sequence select="
            ((($bytes[1] - 192) * 64) +
              ($bytes[2] - 128)        ),
            so:utf8decode( subsequence( $bytes, 3))" />
      </xsl:when>
      <!-- Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. -->
      <xsl:when test="$bytes[1] lt 240">
        <xsl:sequence select="
            ((($bytes[1] - 224) * 4096) +
             (($bytes[2] - 128) *   64) +
              ($bytes[3] - 128)          ),
            so:utf8decode( subsequence( $bytes, 4))" />
      </xsl:when>
      <!-- Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. -->
      <xsl:when test="$bytes[1] lt 248">
        <xsl:sequence select="
            ((($bytes[1] - 240) * 262144) +
             (($bytes[2] - 128) *   4096) +
             (($bytes[3] - 128) *     64) +
              ($bytes[4] - 128)            ),
            so:utf8decode( subsequence( $bytes, 5))" />
      </xsl:when>
      <xsl:otherwise>
        <!-- 0xF8 and above is not a valid UTF-8 lead byte: drop. -->
        <xsl:sequence select="so:utf8decode( subsequence( $bytes, 2))" />
      </xsl:otherwise>
    </xsl:choose>
  </xsl:function>

  <!--
    Text inside <string>: each maximal run of %HH escapes is converted to
    bytes, decoded as UTF-8 and written out as characters; all other text
    is emitted unchanged. The regex attribute is an attribute value
    template, hence the doubled braces around the quantifier.
  -->
  <xsl:template match="string/text()">
    <xsl:analyze-string select="." regex="(%[0-9A-F]{{2}})+" flags="i">
      <xsl:matching-substring>
        <xsl:variable name="utf8-bytes" as="xs:integer+">
          <!-- Walk the run one %HH escape at a time. -->
          <xsl:analyze-string select="." regex="%([0-9A-F]{{2}})" flags="i">
            <xsl:matching-substring>
              <!-- Numeric value of each of the two hex digits: letters are
                   measured from 'A' (plus 10), digits from '0'. -->
              <xsl:variable name="nibble-pair" select="
                  for $nibble-char in string-to-codepoints( upper-case(regex-group(1))) return
                    if ($nibble-char ge $cp-base[2]) then
                      $nibble-char - $cp-base[2] + 10
                    else
                      $nibble-char - $cp-base[1]" as="xs:integer+" />
              <xsl:sequence select="$nibble-pair[1] * 16 + $nibble-pair[2]" />
            </xsl:matching-substring>
          </xsl:analyze-string>
        </xsl:variable>
        <xsl:value-of select="codepoints-to-string( so:utf8decode( $utf8-bytes))" />
      </xsl:matching-substring>
      <xsl:non-matching-substring>
        <xsl:value-of select="." />
      </xsl:non-matching-substring>
      <xsl:fallback>
        <!-- For XSLT 1.0 operating in forward compatibility mode,
             just echo -->
        <xsl:value-of select="." />
      </xsl:fallback>
    </xsl:analyze-string>
  </xsl:template>

</xsl:stylesheet>
...applied to this input...
<doc>
<string>/iTunes/iTunes%20Music/Droit%20devant/L'odysse%CC%81e.mp3</string>
<string>A%Cc%80%20la%20Pe%CC%82che</string>
<string>%D0%97%D0%B0%D0%BF%D0%BE%D0%BC%D0%B8%D0%BD%D0%B0%D0%B8%CC%86</string>
<string>%CE%9A%CE%BF%CC%81%CF%84%CF%83%CC%8C%CE%B1%CF%81%CE%B9</string>
</doc>
...yields...
<doc>
<string>/iTunes/iTunes Music/Droit devant/L'odyssée.mp3</string>
<string>À la Pêche</string>
<string>Запоминай</string>
<string>Κότσ̌αρι</string>
</doc>
Here's one option using the java.net.URLDecoder.decode Java method. Because Saxon-HE does not support calling Java methods directly as extension functions, you'll either have to upgrade to Saxon-PE (or EE) or downgrade to Saxon-B.
Saxon-B is free and is still an XSLT 2.0 processor. Both can be found here: http://saxon.sourceforge.net/
Example...
XML Input
<doc>
<string>/iTunes/iTunes%20Music/Droit%20devant/L'odysse%CC%81e.mp3</string>
<string>A%CC%80%20la%20Pe%CC%82che</string>
<string>%D0%97%D0%B0%D0%BF%D0%BE%D0%BC%D0%B8%D0%BD%D0%B0%D0%B8%CC%86</string>
<string>%CE%9A%CE%BF%CC%81%CF%84%CF%83%CC%8C%CE%B1%CF%81%CE%B9</string>
</doc>
XSLT 2.0 (tested with Saxon-PE 9.4 and Saxon-B 9.1)
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:java-urldecode="java.net.URLDecoder">

  <!--
    Writes the percent-decoded value of every <string> element, one per
    line, as plain text. Decoding is delegated to the Java method
    java.net.URLDecoder.decode(String, String), which requires a
    processor that can call Java methods as extension functions.
  -->

  <!-- Text output: decoded names containing '&' or '<' must not be
       re-escaped as XML entities. -->
  <xsl:output method="text" encoding="UTF-8"/>
  <xsl:strip-space elements="*"/>

  <xsl:template match="string">
    <!-- URLDecoder applies form decoding, which turns '+' into a space.
         A '+' in these values is a literal plus sign, so protect it as
         %2B before decoding. -->
    <xsl:value-of select="java-urldecode:decode(replace(., '\+', '%2B'), 'UTF-8')"/>
    <!-- Line feed as a character reference so reformatting the stylesheet
         cannot change the separator. -->
    <xsl:text>&#10;</xsl:text>
  </xsl:template>

</xsl:stylesheet>
Output
/iTunes/iTunes Music/Droit devant/L'odyssée.mp3
À la Pêche
Запоминай
Κότσ̌αρι
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With