My question is sort of like this question but I have more constraints:
Are there any tools set up to do this or am I better off just breaking out RegexBuddy and C#?
I'm open to command line or batch processing tools as well as C/C#/D libraries.
This code, which I hacked up today with HTML Agility Pack, will extract unformatted, trimmed text.
/// <summary>
/// Extracts the readable text of an HTML string as one unformatted line:
/// every non-empty text node is entity-decoded, trimmed, and the pieces are
/// joined with single spaces.
/// </summary>
/// <param name="html">The HTML markup to extract text from.</param>
/// <returns>The trimmed text chunks joined by a single space; empty when there is no text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string ExtractText(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(html);

    var chunks = new List<string>();
    foreach (var item in doc.DocumentNode.DescendantNodesAndSelf())
    {
        if (item.NodeType != HtmlNodeType.Text)
        {
            continue;
        }

        // script and style bodies are code, not readable text, so they must
        // not end up in the output
        HtmlNode parent = item.ParentNode;
        if (parent != null && (parent.Name == "script" || parent.Name == "style"))
        {
            continue;
        }

        // decode entities before trimming so that a node holding only
        // "&nbsp;" is treated as whitespace, and trim only once per node
        string text = HtmlEntity.DeEntitize(item.InnerText).Trim();
        if (text.Length > 0)
        {
            chunks.Add(text);
        }
    }

    return String.Join(" ", chunks);
}
If you want to maintain some level of formatting, you can build on the sample provided with the source.
/// <summary>
/// Loads the HTML document at <paramref name="path"/> and returns the text
/// that ConvertTo writes for its document node.
/// </summary>
/// <param name="path">Path handed to HtmlDocument.Load.</param>
/// <returns>The accumulated text output.</returns>
public string Convert(string path)
{
    var document = new HtmlDocument();
    document.Load(path);

    // collect the converter's output in memory
    var buffer = new StringWriter();
    ConvertTo(document.DocumentNode, buffer);
    buffer.Flush();

    return buffer.ToString();
}
/// <summary>
/// Parses the HTML markup in <paramref name="html"/> and returns the text
/// that ConvertTo writes for its document node.
/// </summary>
/// <param name="html">HTML markup handed to HtmlDocument.LoadHtml.</param>
/// <returns>The accumulated text output.</returns>
public string ConvertHtml(string html)
{
    var document = new HtmlDocument();
    document.LoadHtml(html);

    // collect the converter's output in memory
    var buffer = new StringWriter();
    ConvertTo(document.DocumentNode, buffer);
    buffer.Flush();

    return buffer.ToString();
}
/// <summary>
/// Writes the plain-text rendering of <paramref name="node"/> to
/// <paramref name="outText"/>, recursing into child nodes.
/// </summary>
/// <remarks>
/// Comments produce no output. Text nodes are written entity-decoded unless
/// they sit directly under a script or style element, are an overlapped
/// closing element, or contain only whitespace. A "p" element emits "\r\n"
/// before its children.
/// </remarks>
/// <param name="node">The node to convert.</param>
/// <param name="outText">The writer receiving the text.</param>
public void ConvertTo(HtmlNode node, TextWriter outText)
{
    switch (node.NodeType)
    {
        case HtmlNodeType.Document:
            ConvertContentTo(node, outText);
            return;

        case HtmlNodeType.Element:
            // a paragraph starts on a new line
            if (node.Name == "p")
            {
                outText.Write("\r\n");
            }

            if (node.HasChildNodes)
            {
                ConvertContentTo(node, outText);
            }
            return;

        case HtmlNodeType.Text:
            // the contents of script and style elements are never emitted
            string owner = node.ParentNode.Name;
            if (owner == "script" || owner == "style")
            {
                return;
            }

            string raw = ((HtmlTextNode) node).Text;

            // a special closing node that the parser surfaced as text
            if (HtmlNode.IsOverlappedClosingElement(raw))
            {
                return;
            }

            // skip runs made up purely of whitespace
            if (raw.Trim().Length > 0)
            {
                outText.Write(HtmlEntity.DeEntitize(raw));
            }
            return;

        case HtmlNodeType.Comment:
        default:
            // comments (and any other node type) produce no output
            return;
    }
}
/// <summary>
/// Converts each child of <paramref name="node"/>, in order, by passing it
/// to ConvertTo together with <paramref name="outText"/>.
/// </summary>
/// <param name="node">The node whose children are converted.</param>
/// <param name="outText">The writer receiving the text.</param>
private void ConvertContentTo(HtmlNode node, TextWriter outText)
{
    foreach (HtmlNode child in node.ChildNodes)
    {
        ConvertTo(child, outText);
    }
}
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With