Add a new option for MaxCrawlDepth
Stefan Broenner committed Dec 7, 2023
1 parent 5a7d14a commit 1e8a219
Showing 5 changed files with 19 additions and 21 deletions.
16 changes: 2 additions & 14 deletions AzureSearchCrawler/AzureSearchCrawler.csproj
@@ -23,26 +23,14 @@
  <Copyright>Copyright © 2017</Copyright>
  <AssemblyVersion>1.0.0.0</AssemblyVersion>
  <FileVersion>1.0.0.0</FileVersion>
- <StartupObject>AzureSearchCrawler.CrawlerMain</StartupObject>
  </PropertyGroup>
- <ItemGroup>
-   <BootstrapperPackage Include=".NETFramework,Version=v4.5.2">
-     <Visible>False</Visible>
-     <ProductName>Microsoft .NET Framework 4.5.2 %28x86 and x64%29</ProductName>
-     <Install>true</Install>
-   </BootstrapperPackage>
-   <BootstrapperPackage Include="Microsoft.Net.Framework.3.5.SP1">
-     <Visible>False</Visible>
-     <ProductName>.NET Framework 3.5 SP1</ProductName>
-     <Install>false</Install>
-   </BootstrapperPackage>
- </ItemGroup>
<ItemGroup>
<PackageReference Include="Abot" Version="2.0.70" />
<PackageReference Include="AngleSharp" Version="1.0.7" />
<PackageReference Include="Azure.Search.Documents" Version="11.5.1" />
<PackageReference Include="FluentCommandLineParser" Version="1.4.3" />
<PackageReference Include="HtmlAgilityPack" Version="1.11.54" />
<PackageReference Include="log4net" Version="2.0.15" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />

</ItemGroup>
</Project>
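
The dependency list above is what makes the crawler tick: Abot drives the crawl, and Azure.Search.Documents uploads what it finds. As a point of reference for that second piece, here is a minimal, hedged sketch of pushing one crawled page into an index with the 11.x SDK referenced above. The endpoint, index name, key, and the UploadExample class are placeholders, not code from this repository; the id/url/content fields are the ones the README asks for.

    using System;
    using System.Threading.Tasks;
    using Azure;
    using Azure.Search.Documents;

    class UploadExample
    {
        static async Task Main()
        {
            // Placeholder service details -- substitute your own search service.
            var client = new SearchClient(
                new Uri("https://<your-service>.search.windows.net"),
                "<your-index>",
                new AzureKeyCredential("<admin-api-key>"));

            // The README calls for searchable "id", "url", and "content" fields.
            var page = new
            {
                id = "example-page-1", // must be a valid document key in the real index
                url = "https://example.com/",
                content = "Extracted page text goes here."
            };

            await client.UploadDocumentsAsync(new[] { page });
        }
    }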
1 change: 1 addition & 0 deletions AzureSearchCrawler/AzureSearchIndexer.cs
@@ -54,6 +54,7 @@ public AzureSearchIndexer(string serviceEndPoint, string indexName, string admin
public async Task PageCrawledAsync(CrawledPage crawledPage)
{
string text = _textExtractor.ExtractText(crawledPage.Content.Text);
+
if (text == null)
{
Console.WriteLine("No content for page {0}", crawledPage?.Uri.AbsoluteUri);
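
One small thing stands out in the hunk above: crawledPage.Content.Text is dereferenced unconditionally on the first line, yet the logging line below it uses the null-conditional crawledPage?.Uri, which can no longer be null by that point. A hedged sketch of a more consistent guard follows; the stub types are illustrative only, since the real CrawledPage comes from Abot and TextExtractor lives elsewhere in this repo.

    using System;
    using System.Threading.Tasks;

    // Stub standing in for Abot's CrawledPage; illustrative only.
    class PageStub { public Uri Uri; public string ContentText; }

    class GuardSketch
    {
        public Task PageCrawledAsync(PageStub crawledPage)
        {
            // Check the whole chain once, up front, rather than mixing
            // plain dereferences with later ?. operators.
            string text = crawledPage?.ContentText;
            if (string.IsNullOrEmpty(text))
            {
                Console.WriteLine("No content for page {0}", crawledPage?.Uri?.AbsoluteUri);
                return Task.CompletedTask;
            }

            // ... hand the text to the indexer here ...
            return Task.CompletedTask;
        }
    }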
10 changes: 6 additions & 4 deletions AzureSearchCrawler/Crawler.cs
@@ -22,9 +22,9 @@ public Crawler(CrawlHandler handler)
_handler = handler;
}

- public async Task Crawl(string rootUri, int maxPages)
+ public async Task Crawl(string rootUri, int maxPages, int maxDepth)
{
- PoliteWebCrawler crawler = new(CreateCrawlConfiguration(maxPages), null, null, null, null, null, null, null, null);
+ PoliteWebCrawler crawler = new(CreateCrawlConfiguration(maxPages, maxDepth), null, null, null, null, null, null, null, null);

crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
@@ -70,7 +70,7 @@ async void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
await _handler.PageCrawledAsync(crawledPage);
}

- private CrawlConfiguration CreateCrawlConfiguration(int maxPages)
+ private CrawlConfiguration CreateCrawlConfiguration(int maxPages, int maxDepth)
{
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;

@@ -81,7 +81,9 @@ private CrawlConfiguration CreateCrawlConfiguration(int maxPages)
MinCrawlDelayPerDomainMilliSeconds = 100,
IsSslCertificateValidationEnabled = true,
MaxPagesToCrawl = maxPages,
+ MaxCrawlDepth = maxDepth,
+ UserAgentString = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
};

return crawlConfig;
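
To make the new knob concrete: in Abot, which this Crawler class wraps, MaxCrawlDepth counts levels below the root page, so 0 crawls only the root, 1 adds the pages it links to, and so on. Below is a minimal, hedged sketch of the same configuration against Abot 2.x directly; the URL, limits, and the DepthDemo class are placeholders, not this repo's wrapper code.

    using System;
    using System.Threading.Tasks;
    using Abot2.Crawler;
    using Abot2.Poco;

    class DepthDemo
    {
        static async Task Main()
        {
            var config = new CrawlConfiguration
            {
                MaxPagesToCrawl = 100,
                MaxCrawlDepth = 2 // root, its links, and their links -- nothing deeper
            };

            var crawler = new PoliteWebCrawler(config);
            crawler.PageCrawlCompleted += (s, e) =>
                Console.WriteLine("Crawled {0}", e.CrawledPage.Uri);

            // example.com is a placeholder crawl target.
            await crawler.CrawlAsync(new Uri("https://example.com"));
        }
    }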
11 changes: 9 additions & 2 deletions AzureSearchCrawler/CrawlerMain.cs
@@ -9,13 +9,16 @@ namespace AzureSearchCrawler
class CrawlerMain
{
private const int DefaultMaxPagesToIndex = 100;
+ private const int DefaultMaxCrawlDepth = 10;

private class Arguments
{
public string RootUri { get; set; }

public int MaxPagesToIndex { get; set; }

+ public int MaxCrawlDepth { get; set; }

public string ServiceEndPoint { get; set; }

public string IndexName { get; set; }
@@ -37,6 +40,11 @@ static void Main(string[] args)
.SetDefault(DefaultMaxPagesToIndex)
.WithDescription("Stop after having indexed this many pages. Default is " + DefaultMaxPagesToIndex + "; 0 means no limit.");

+ p.Setup(arg => arg.MaxCrawlDepth)
+     .As('d', "maxDepth")
+     .SetDefault(DefaultMaxCrawlDepth)
+     .WithDescription("Maximum crawl depth. Default is " + DefaultMaxCrawlDepth);

p.Setup(arg => arg.ServiceEndPoint)
.As('s', "ServiceEndPoint")
.Required()
@@ -70,9 +78,8 @@ static void Main(string[] args)

var indexer = new AzureSearchIndexer(arguments.ServiceEndPoint, arguments.IndexName, arguments.AdminApiKey, new TextExtractor());
var crawler = new Crawler(indexer);
- crawler.Crawl(arguments.RootUri, maxPages: arguments.MaxPagesToIndex).Wait();
+ crawler.Crawl(arguments.RootUri, maxPages: arguments.MaxPagesToIndex, maxDepth: arguments.MaxCrawlDepth).Wait();

Console.Read(); // keep the console open until a key is pressed so we see the output
}
}
}
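
For anyone tracing how the new flag is parsed: the Setup call above wires -d / --maxDepth through FluentCommandLineParser. Here is a self-contained sketch of just that behavior; the Args class, defaults, and ParseDemo name are illustrative, mirroring but not copying the private Arguments class above.

    using System;
    using Fclp;

    class ParseDemo
    {
        class Args
        {
            public int MaxCrawlDepth { get; set; }
        }

        static void Main(string[] args)
        {
            var p = new FluentCommandLineParser<Args>();

            p.Setup(a => a.MaxCrawlDepth)
                .As('d', "maxDepth")
                .SetDefault(10)
                .WithDescription("Maximum crawl depth. Default is 10");

            var result = p.Parse(args);
            if (result.HasErrors)
            {
                Console.WriteLine(result.ErrorText);
                return;
            }

            // "-d 5" and "--maxDepth 5" both print 5; no flag prints the default, 10.
            Console.WriteLine("MaxCrawlDepth = {0}", p.Object.MaxCrawlDepth);
        }
    }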
2 changes: 1 addition & 1 deletion README.md
@@ -9,7 +9,7 @@ This project is intended as a demo or a starting point for a real crawler. At a

- Create an Azure Search search service. If you're new to Azure Search, follow [this guide](https://docs.microsoft.com/en-us/azure/search/search-create-service-portal).
- Create an index in your search service with three string fields: "id", "url", and "content". Make them searchable.
- - Run CrawlerMain, either from Visual Studio after opening the .sln file, or from the command line after compiling using msbuild. You will need to pass a few command-line arguments, such as your search service information and the root URL of the site you'd like to crawl. Calling the program without arguments or with -h will list the arguments.
+ - Run CrawlerMain, either from Visual Studio after opening the .sln file, or from the command line after compiling using msbuild. You will need to pass a few command-line arguments, such as your search service endpoint (URL) and the root URL of the site you'd like to crawl. Calling the program without arguments or with -h will list the arguments.
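
For illustration, an invocation with the new option might look like the line below. Only -d/--maxDepth and -s/--ServiceEndPoint are visible in this commit's setup code; the other flag letters are assumptions, so run the program with -h for the authoritative list.

    AzureSearchCrawler.exe -r https://example.com -m 500 -d 5 -s https://<your-service>.search.windows.net -i <index-name> -a <admin-api-key>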


# Howto: customize it for your project
