DotnetSpider is a .NET Standard web crawling library similar to WebMagic and Scrapy. It is a lightweight, efficient and fast high-level web crawling & scraping framework for .NET.
- Visual Studio 2017 (15.3 or later)
- .NET Core 2.0 or later
- Store data to MySql. Download MySql

      grant all on *.* to 'root'@'localhost' IDENTIFIED BY '' with grant option;
      flush privileges;

- Run distributed crawler. Download Redis for windows
- SqlServer.
- PostgreSQL.
- MongoDb
- Cassandra
Documentation: https://github.com/dotnetcore/DotnetSpider/wiki
Please see the project DotnetSpider.Sample in the solution.
[TaskName("JdSkuSampleSpider")]
public class JdSkuSampleSpider : EntitySpider
{
    public JdSkuSampleSpider() : base("JdSkuSample", new Site
    {
    })
    {
    }

    protected override void MyInit(params string[] arguments)
    {
        Identity = Identity ?? "JD SKU SAMPLE";

        // Store data to MySQL. The default pipeline is the MySQL entity pipeline, so this line can be omitted. Don't miss SslMode.
        AddPipeline(new MySqlEntityPipeline("Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"));
        AddStartUrl("http://list.jd.com/list.html?cat=9987,653,655&page=2&JL=6_0_0&ms=5#J_main", new Dictionary<string, object> { { "name", "手机" }, { "cat3", "655" } });
        AddEntityType<Product>();
    }
}
[EntityTable("test", "jd_sku", EntityTable.Monday, Indexs = new[] { "Category" }, Uniques = new[] { "Category,Sku", "Sku" })]
[EntitySelector(Expression = "//li[@class='gl-item']/div[contains(@class,'j-sku-item')]")]
[TargetUrlsSelector(XPaths = new[] { "//span[@class=\"p-num\"]" }, Patterns = new[] { @"&page=[0-9]+&" })]
public class Product : SpiderEntity
{
    [PropertyDefine(Expression = "./@data-sku", Length = 100)]
    public string Sku { get; set; }

    [PropertyDefine(Expression = "name", Type = SelectorType.Enviroment, Length = 100)]
    public string Category { get; set; }

    [PropertyDefine(Expression = "cat3", Type = SelectorType.Enviroment)]
    public int CategoryId { get; set; }

    [PropertyDefine(Expression = "./div[1]/a/@href")]
    public string Url { get; set; }

    [PropertyDefine(Expression = "./div[5]/strong/a")]
    public long CommentsCount { get; set; }

    [PropertyDefine(Expression = ".//div[@class='p-shop']/@data-shop_name", Length = 100)]
    public string ShopName { get; set; }

    [PropertyDefine(Expression = "0", Type = SelectorType.Enviroment)]
    public int ShopId { get; set; }

    [PropertyDefine(Expression = ".//div[@class='p-name']/a/em", Length = 100)]
    public string Name { get; set; }

    [PropertyDefine(Expression = "./@venderid", Length = 100)]
    public string VenderId { get; set; }

    [PropertyDefine(Expression = "./@jdzy_shop_id", Length = 100)]
    public string JdzyShopId { get; set; }

    [PropertyDefine(Expression = "Monday", Type = SelectorType.Enviroment)]
    public DateTime RunId { get; set; }
}
public static void Main()
{
    Startup.Run(new string[] { "-s:JdSkuSampleSpider", "-tid:JdSkuSampleSpider", "-i:guid" });
}
Command: -s:[spider type name | TaskName attribute] -i:[identity] -a:[arg1,arg2...] -tid:[taskId] -n:[name] -c:[configuration file path]
- -s: Type name of spider or TaskNameAttribute, for example: DotnetSpider.Sample.BaiduSearchSpider
- -i: Set identity.
- -a: Pass arguments to spider's Run method.
- -tid: Set task id.
- -n: Set name.
- -c: Set config file path, for example if you want to run with a customized config: -c:app.my.config
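For example, a command-line run of the sample spider above might look like the following (assuming the compiled sample assembly is named DotnetSpider.Sample.dll; adjust the name to your project's output):

    dotnet DotnetSpider.Sample.dll -s:JdSkuSampleSpider -tid:JdSkuSampleSpider -i:guid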
When you want to crawl a page that is loaded by JavaScript, there is only one thing to do: set the downloader to WebDriverDownloader.
Downloader = new WebDriverDownloader(Browser.Chrome);
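A minimal sketch of where this assignment would go, assuming the same MyInit override used by the sample spider above (only the Downloader line is new; the rest of the initialization stays as in the sample):

    protected override void MyInit(params string[] arguments)
    {
        // ... pipelines, start urls and entity types as in the sample above ...

        // Render pages in a real browser so JavaScript-generated content is downloaded
        // before extraction. Requires ChromeDriver.exe next to the binaries (see the notes below).
        Downloader = new WebDriverDownloader(Browser.Chrome);
    }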
NOTE:
- Make sure there is a ChromeDriver.exe in the bin folder when you try to use Chrome. You can add it to your project via the NuGet manager: Chromium.ChromeDriver
- Make sure you have already added a *.webdriver Firefox profile when you try to use Firefox: https://support.mozilla.org/en-US/kb/profile-manager-create-and-remove-firefox-profiles
- Make sure there is a PhantomJS.exe in the bin folder when you try to use PhantomJS. You can add it to your project via the NuGet manager: PhantomJS
- Set SystemConnection in app.config
- Update nlog.config like https://github.com/dotnetcore/DotnetSpider/blob/master/src/DotnetSpider.Extension.Test/nlog.config
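A sketch of the SystemConnection entry, assuming the standard .NET connectionStrings section in app.config and reusing the MySQL connection string from the sample above; the provider name is an assumption, so verify it against the wiki and the sample configs:

    <connectionStrings>
      <!-- Entry name taken from the note above; provider name is assumed, verify against the sample configs. -->
      <add name="SystemConnection"
           connectionString="Database='mysql';Data Source=localhost;User ID=root;Password=;Port=3306;SslMode=None;"
           providerName="MySql.Data.MySqlClient" />
    </connectionStrings>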
DotnetSpider.Enterprise: https://github.com/zlzforever/DotnetSpider.Enterprise
- Depends on a CI platform; for example, gitlab-ci is used right now.
- Depends on Scheduler.NET https://github.com/zlzforever/Scheduler.NET
- More documentation to come...
Recommended Redis configuration (redis.conf) when running a distributed crawler with Redis:

    timeout 0
    tcp-keepalive 60
- Table and column names defined by EntitySpider are lower-cased, so that data can be converted between different databases or moved between MySQL on Windows and Linux.
- A spider is allowed to run without adding any Pipeline.
QQ Group: 477731655 Email: [email protected]