Skip to content

Commit

Permalink
代码优化
Browse files Browse the repository at this point in the history
  • Loading branch information
keroroqingwa committed May 5, 2020
1 parent 5f562d2 commit e85b205
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 19 deletions.
28 changes: 9 additions & 19 deletions src/CatBookApp.Application/BookSearches/Captures/BiqugeCapture.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace CatBookApp.BookSearches.Captures
{
/// <summary>
/// 抓取来源:笔趣阁
/// 抓取来源:笔趣阁 https://www.xsbiquge.com
/// </summary>
public class BiqugeCapture : IBookCapture
{
Expand All @@ -23,34 +23,23 @@ public List<BookInfoDto> GetBooks(string q, int pn)
{
HtmlDocument doc = new HtmlDocument();
string url = string.Empty;
//这里请求三次是因为。。。 调试过就知道,你就当做是错误重试吧 (′゜ω。‵)
//这里请求两次是因为。。。 调试过就知道,你就当做是错误重试吧 (′゜ω。‵)
try
{
try
{
//no.1
url = $"https://www.xxbiquge.com/search.php?keyword={q}&page={pn}&p={pn - 1}";
url = $"https://www.xsbiquge.com/search.php?keyword={q}&page={pn}&p={pn - 1}";
HtmlWeb webClient = new HtmlWeb();
doc = webClient.Load(url);
}
catch
{
try
{
//no.2
url = $"http://zhannei.baidu.com/cse/search?s=8823758711381329060&q={q}&page={pn}&p={pn - 1}";
Thread.Sleep(1000 * 1);
var html = Utils.HttpHelper.Get(url);
doc.LoadHtml(html);
}
catch
{
//no.3
url = $"http://zhannei.baidu.com/cse/search?s=3654077655350271938&q={q}&page={pn}&p={pn - 1}";
Thread.Sleep(1000 * 2);
HtmlWeb webClient = new HtmlWeb();
doc = webClient.Load(url);
}
//no.2
url = $"https://www.xxbiquge.com/search.php?keyword={q}&page={pn}&p={pn - 1}";
Thread.Sleep(1000 * 1);
var html = Utils.HttpHelper.Get(url);
doc.LoadHtml(html);
}
}
catch (Exception ex)
Expand Down Expand Up @@ -136,6 +125,7 @@ public BookChapterDto GetBookChapters(string bookLink)
var bookChapter = new BookChapterDto()
{
BookName = doc.DocumentNode.SelectSingleNode("//div[@id='info']/h1").InnerText.Trim(),
BookLink = bookLink,
Author = nodes[0].InnerText.Replace(nodes[0].InnerText.Split(':')[0] + ":", string.Empty).Trim(),
Status = nodes[1].InnerText.Replace(nodes[1].InnerText.Split(':')[0] + ":", string.Empty).Replace(",加入书架,直达底部", string.Empty),
Last_Update_Time = nodes[2].InnerText.Replace(nodes[2].InnerText.Split(':')[0] + ":", string.Empty),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,50 @@ public static string ClearSensitiveCharacter(string input)
input = input.Replace(item.ToString(), "");
}
//
input = HtmlToTxt(input);

return input;
}

/// <summary>
/// Strips HTML markup from a string: removes script blocks, tags, a fixed set of
/// named/numeric character entities, comment delimiters, any remaining angle
/// brackets, and CRLF line breaks.
/// </summary>
/// <remarks>
/// Entities are removed, not decoded (for example <c>&amp;nbsp;</c> becomes an empty
/// string rather than a space).
/// </remarks>
/// <param name="strHtml">The HTML text to clean. May be null or empty.</param>
/// <returns>
/// The cleaned text, or <paramref name="strHtml"/> unchanged when it is null or empty.
/// </returns>
public static string HtmlToTxt(string strHtml)
{
    // Nothing to strip; also avoids an ArgumentNullException from Regex.Replace.
    if (string.IsNullOrEmpty(strHtml))
    {
        return strHtml;
    }

    // Patterns are applied in order; every match is replaced with an empty string.
    string[] aryReg =
    {
        @"<script[^>]*?>.*?</script>",
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);",
        @"-->",
        @"<!--.*\n"
    };

    string strOutput = strHtml;
    foreach (string pattern in aryReg)
    {
        strOutput = Regex.Replace(strOutput, pattern, string.Empty, RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string (strings are immutable), so the result
    // must be assigned back; the previous code discarded it and these three
    // clean-up steps silently did nothing.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}
}
}

0 comments on commit e85b205

Please sign in to comment.