Skip to content

Commit

Permalink
Refactor TextExtractor a bit, normalize whitespace.
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Kappler committed Apr 14, 2017
1 parent ec1afb5 commit 45ab0ca
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions AzureSearchCrawler/TextExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,30 @@ namespace AzureSearchCrawler
{
/// <summary>
/// Extracts text content from a web page. The default implementation is very simple: it removes all script, style,
/// svg, and path tags, and then returns the InnerText of the page body, with cleaned-up whitespace.
/// <para/>You can implement your own custom text extraction by overriding the ExtractText method. The protected
/// helper methods in this class might be useful. GetCleanedUpTextForXpath is the easiest way to get started.
/// </summary>
public class TextExtractor
{
// Matches a run of one or more consecutive line breaks, each either "\r\n" or "\n".
// NOTE(review): presumably consumed by NormalizeWhitespace, which is not visible here - confirm.
private readonly Regex newlines = new Regex(@"(\r\n|\n)+");
// Matches a run of one or more consecutive space or tab characters.
private readonly Regex spaces = new Regex(@"[ \t]+");

/// <summary>
/// Default text extraction: delegates to <see cref="GetCleanedUpTextForXpath"/> using the
/// XPath <c>//body</c>. Declared virtual so that subclasses can supply their own extraction.
/// </summary>
/// <param name="doc">The HTML document to extract text from.</param>
/// <returns>Whatever <see cref="GetCleanedUpTextForXpath"/> returns for the <c>//body</c> XPath.</returns>
public virtual string ExtractText(HtmlDocument doc)
{
    const string bodyXpath = "//body";
    string bodyText = GetCleanedUpTextForXpath(doc, bodyXpath);
    return bodyText;
}

/// <summary>
/// Strips script, style, svg, and path nodes from the document, extracts text via
/// ExtractTextFromFirstMatchingElement for the given XPath, and passes the result through
/// NormalizeWhitespace.
/// </summary>
/// <param name="doc">
/// The HTML document to process. The node types listed above are passed to RemoveNodesOfType
/// before extraction (NOTE(review): this appears to modify <paramref name="doc"/> in place - confirm).
/// </param>
/// <param name="xpath">XPath expression selecting the element whose text should be extracted.</param>
/// <returns>
/// <c>null</c> when <paramref name="doc"/> or its DocumentNode is <c>null</c>; otherwise the
/// value returned by NormalizeWhitespace for the extracted content.
/// </returns>
public string GetCleanedUpTextForXpath(HtmlDocument doc, string xpath)
{
    if (doc == null || doc.DocumentNode == null)
    {
        return null;
    }

    RemoveNodesOfType(doc, "script", "style", "svg", "path");

    // Honor the caller-supplied XPath. The earlier hard-coded "//body" extraction is dropped:
    // it declared `content` a second time in the same scope and ignored the xpath parameter.
    string content = ExtractTextFromFirstMatchingElement(doc, xpath);
    return NormalizeWhitespace(content);
}

Expand All @@ -49,7 +55,7 @@ protected void RemoveNodesOfType(HtmlDocument doc, params string[] types)
protected void RemoveNodes(HtmlDocument doc, string xpath)
{
var nodes = SafeSelectNodes(doc, xpath).ToList();
Console.WriteLine("Removing {0} nodes matching {1}.", nodes.Count, xpath);
// Console.WriteLine("Removing {0} nodes matching {1}.", nodes.Count, xpath);
foreach (var node in nodes)
{
node.Remove();
Expand Down

0 comments on commit 45ab0ca

Please sign in to comment.