In order to strip out tags we can use replace() function and can also use . textContent property, . innerText property from HTML DOM.
SQL Server TRIM() Function The TRIM() function removes the space character OR other specified characters from the start or end of a string. By default, the TRIM() function removes leading and trailing spaces from a string. Note: Also look at the LTRIM() and RTRIM() functions.
Strip_tags() is a function that allows you to strip out all HTML and PHP tags from a given string (parameter one), however you can also use parameter two to specify a list of HTML tags you want.
There is a UDF that will do that described here:
User Defined Function to Strip HTML
CREATE FUNCTION [dbo].[udf_StripHTML] (@HTMLText VARCHAR(MAX))
RETURNS VARCHAR(MAX) AS
BEGIN
DECLARE @Start INT
DECLARE @End INT
DECLARE @Length INT
SET @Start = CHARINDEX('<',@HTMLText)
SET @End = CHARINDEX('>',@HTMLText,CHARINDEX('<',@HTMLText))
SET @Length = (@End - @Start) + 1
WHILE @Start > 0 AND @End > 0 AND @Length > 0
BEGIN
SET @HTMLText = STUFF(@HTMLText,@Start,@Length,'')
SET @Start = CHARINDEX('<',@HTMLText)
SET @End = CHARINDEX('>',@HTMLText,CHARINDEX('<',@HTMLText))
SET @Length = (@End - @Start) + 1
END
RETURN LTRIM(RTRIM(@HTMLText))
END
GO
Edit: note this is for SQL Server 2005, but if you change the keyword MAX to something like 4000, it will work in SQL Server 2000 as well.
Derived from @Goner Doug answer, with a few things updated:
- using REPLACE where possible
- conversion of predefined entities like é
(I chose the ones I needed :-)
- some conversion of list tags <ul> and <li>
ALTER FUNCTION [dbo].[udf_StripHTML]
--by Patrick Honorez --- www.idevlop.com
--inspired by http://stackoverflow.com/questions/457701/best-way-to-strip-html-tags-from-a-string-in-sql-server/39253602#39253602
(
@HTMLText varchar(MAX)
)
RETURNS varchar(MAX)
AS
BEGIN
DECLARE @Start int
DECLARE @End int
DECLARE @Length int
set @HTMLText = replace(@htmlText, '<br>',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '<br/>',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '<br />',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '<li>','- ')
set @HTMLText = replace(@htmlText, '</li>',CHAR(13) + CHAR(10))
set @HTMLText = replace(@htmlText, '’' collate Latin1_General_CS_AS, '''' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '"' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '&' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '€' collate Latin1_General_CS_AS, '€' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '<' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '>' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'œ' collate Latin1_General_CS_AS, 'oe' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, ' ' collate Latin1_General_CS_AS, ' ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '©' collate Latin1_General_CS_AS, '©' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '«' collate Latin1_General_CS_AS, '«' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '®' collate Latin1_General_CS_AS, '®' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '±' collate Latin1_General_CS_AS, '±' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '²' collate Latin1_General_CS_AS, '²' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '³' collate Latin1_General_CS_AS, '³' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'µ' collate Latin1_General_CS_AS, 'µ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '·' collate Latin1_General_CS_AS, '·' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'º' collate Latin1_General_CS_AS, 'º' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '»' collate Latin1_General_CS_AS, '»' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '¼' collate Latin1_General_CS_AS, '¼' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '½' collate Latin1_General_CS_AS, '½' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '¾' collate Latin1_General_CS_AS, '¾' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '&Aelig' collate Latin1_General_CS_AS, 'Æ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'Ç' collate Latin1_General_CS_AS, 'Ç' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'È' collate Latin1_General_CS_AS, 'È' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'É' collate Latin1_General_CS_AS, 'É' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'Ê' collate Latin1_General_CS_AS, 'Ê' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'Ö' collate Latin1_General_CS_AS, 'Ö' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'à' collate Latin1_General_CS_AS, 'à' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'â' collate Latin1_General_CS_AS, 'â' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ä' collate Latin1_General_CS_AS, 'ä' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'æ' collate Latin1_General_CS_AS, 'æ' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ç' collate Latin1_General_CS_AS, 'ç' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'è' collate Latin1_General_CS_AS, 'è' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'é' collate Latin1_General_CS_AS, 'é' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ê' collate Latin1_General_CS_AS, 'ê' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ë' collate Latin1_General_CS_AS, 'ë' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'î' collate Latin1_General_CS_AS, 'î' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ô' collate Latin1_General_CS_AS, 'ô' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ö' collate Latin1_General_CS_AS, 'ö' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '÷' collate Latin1_General_CS_AS, '÷' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ø' collate Latin1_General_CS_AS, 'ø' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ù' collate Latin1_General_CS_AS, 'ù' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ú' collate Latin1_General_CS_AS, 'ú' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'û' collate Latin1_General_CS_AS, 'û' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, 'ü' collate Latin1_General_CS_AS, 'ü' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '"' collate Latin1_General_CS_AS, '"' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '&' collate Latin1_General_CS_AS, '&' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '‹' collate Latin1_General_CS_AS, '<' collate Latin1_General_CS_AS)
set @HTMLText = replace(@htmlText, '›' collate Latin1_General_CS_AS, '>' collate Latin1_General_CS_AS)
-- Remove anything between <STYLE> tags
SET @Start = CHARINDEX('<STYLE', @HTMLText)
SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('<', @HTMLText)) + 7
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<STYLE', @HTMLText)
SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('</STYLE>', @HTMLText)) + 7
SET @Length = (@End - @Start) + 1
END
-- Remove anything between <whatever> tags
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
END
RETURN LTRIM(RTRIM(@HTMLText))
END
If your HTML is well formed, I think this is a better solution:
create function dbo.StripHTML( @text varchar(max) ) returns varchar(max) as
begin
declare @textXML xml
declare @result varchar(max)
set @textXML = REPLACE( @text, '&', '' );
with doc(contents) as
(
select chunks.chunk.query('.') from @textXML.nodes('/') as chunks(chunk)
)
select @result = contents.value('.', 'varchar(max)') from doc
return @result
end
go
select dbo.StripHTML('This <i>is</i> an <b>html</b> test')
Here's an updated version of this function that incorporates the RedFilter answer (Pinal's original) with the LazyCoders additions and the goodeye typo corrections AND my own addition to handle in-line <STYLE>
tags inside the HTML.
ALTER FUNCTION [dbo].[udf_StripHTML]
(
@HTMLText varchar(MAX)
)
RETURNS varchar(MAX)
AS
BEGIN
DECLARE @Start int
DECLARE @End int
DECLARE @Length int
-- Replace the HTML entity & with the '&' character (this needs to be done first, as
-- '&' might be double encoded as '&amp;')
SET @Start = CHARINDEX('&', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '&')
SET @Start = CHARINDEX('&', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
END
-- Replace the HTML entity < with the '<' character
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '<')
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
END
-- Replace the HTML entity > with the '>' character
SET @Start = CHARINDEX('>', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '>')
SET @Start = CHARINDEX('>', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
END
-- Replace the HTML entity & with the '&' character
SET @Start = CHARINDEX('&amp;', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '&')
SET @Start = CHARINDEX('&amp;', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
END
-- Replace the HTML entity with the ' ' character
SET @Start = CHARINDEX(' ', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, ' ')
SET @Start = CHARINDEX(' ', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1
END
-- Replace any <br> tags with a newline
SET @Start = CHARINDEX('<br>', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, CHAR(13) + CHAR(10))
SET @Start = CHARINDEX('<br>', @HTMLText)
SET @End = @Start + 3
SET @Length = (@End - @Start) + 1
END
-- Replace any <br/> tags with a newline
SET @Start = CHARINDEX('<br/>', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, CHAR(13) + CHAR(10))
SET @Start = CHARINDEX('<br/>', @HTMLText)
SET @End = @Start + 4
SET @Length = (@End - @Start) + 1
END
-- Replace any <br /> tags with a newline
SET @Start = CHARINDEX('<br />', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, CHAR(13) + CHAR(10))
SET @Start = CHARINDEX('<br />', @HTMLText)
SET @End = @Start + 5
SET @Length = (@End - @Start) + 1
END
-- Remove anything between <STYLE> tags
SET @Start = CHARINDEX('<STYLE', @HTMLText)
SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('<', @HTMLText)) + 7
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<STYLE', @HTMLText)
SET @End = CHARINDEX('</STYLE>', @HTMLText, CHARINDEX('</STYLE>', @HTMLText)) + 7
SET @Length = (@End - @Start) + 1
END
-- Remove anything between <whatever> tags
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
WHILE (@Start > 0 AND @End > 0 AND @Length > 0) BEGIN
SET @HTMLText = STUFF(@HTMLText, @Start, @Length, '')
SET @Start = CHARINDEX('<', @HTMLText)
SET @End = CHARINDEX('>', @HTMLText, CHARINDEX('<', @HTMLText))
SET @Length = (@End - @Start) + 1
END
RETURN LTRIM(RTRIM(@HTMLText))
END
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With