DotnetSpider 爬虫图文日记--小说下载（C#，dotnet core）

2018-07-11 本文已影响178人 SpiderMe

项目简介：

需求：爬取网页所有的“玄幻”类型小说。

系统：Windows10

工具：Visual Studio Code

环境：.NET Core 2.1

爬虫库：DotnetSpider

开始之前：

下载书的部分，用到了Python，因为我不知道如何在C#中操作。谢谢！

一：创建项目

在指定目录，创建 DotnetCore Console 程序，并使用 VSCode（Visual Studio Code 后面都使用简称）打开此项目。

# 右下角，提示选择 YES。

添加 DotnetSpider 的库。点击：视图--终端，或者按 CTRL+`（反引号，即 TAB 上面那个键），输入：dotnet add package DotnetSpider.Core --version 2.6.0-beta5。安装完成后，VSCode 会提示还原（Restore）项目，点击确认即可。

二：编辑 Program.cs

using System;

using System.Collections.Generic;

using System.IO;

using System.Text.RegularExpressions;

using DotnetSpider.Core;

using DotnetSpider.Core.Downloader;

using DotnetSpider.Core.Pipeline;

using DotnetSpider.Core.Processor;

using DotnetSpider.Core.Scheduler;

using DotnetSpider.Core.Selector;

namespace SpiderFiction

{

/// <summary>
/// Console entry point: crawls the novel listing pages and hands every parsed page
/// to <c>FictionPipeline</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the crawl, prints a completion message and waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        CostomPageProcessorAndPipeline();

        Console.WriteLine("下载完毕！！！按任意键退出！！！");

        // The message asks for "any key", but Console.Read() only returns after Enter.
        // ReadKey throws when stdin is redirected, so keep Read() for that case.
        if (Console.IsInputRedirected)
        {
            Console.Read();
        }
        else
        {
            Console.ReadKey(true);
        }
    }

    /// <summary>
    /// Builds a spider with the custom page processor and pipeline and runs it
    /// against the first listing page of the "玄幻" (fantasy) category.
    /// </summary>
    private static void CostomPageProcessorAndPipeline()
    {
        // First listing page of the fantasy category.
        string url = "http://www.jjxsw.com/e/action/ListInfo.php?page=0&classid=11&line=10&tempid=3&ph=1&andor=and&qujian=4&orderby=2&myorder=0&totalnum=150";

        Site site = new Site { CycleRetryTimes = 3, SleepTime = 300 };
        site.AddStartUrl(url);

        // Create the spider from a page processor and a scheduler, then attach the data pipeline.
        Spider spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new FictionPageProcessor()).AddPipeline(new FictionPipeline());

        // Downloader used to fetch pages.
        spider.Downloader = new HttpClientDownloader();

        // Number of crawler threads.
        spider.ThreadNum = 4;

        // Per the author's note: how long the spider waits without pending requests before exiting.
        spider.EmptySleepTime = 2000;

        spider.Run();
    }
}

/// <summary>
/// Parses a downloaded listing page. The parsed data is stored on the page under the
/// "FictionResult" key so that the pipeline can process it.
/// </summary>
internal class FictionPageProcessor : BasePageProcessor
{
    /// <summary>
    /// Collects the download-page URL of every novel on the page and queues the pager
    /// links as further requests.
    /// </summary>
    /// <param name="page">The page that was just downloaded.</param>
    protected override void Handle(Page page)
    {
        // One anchor per novel entry on the current listing page.
        var totalFictionElements = page.Selectable.XPath("//div[@class=\"listbg\"]/a[1]").Nodes();

        List<Fiction> results = new List<Fiction>();

        foreach (var fictionElement in totalFictionElements)
        {
            // Link to the novel's detail page.
            string href = fictionElement.Select(Selectors.XPath("@href")).GetValue();

            // An anchor without an href would otherwise turn into the bogus URL "l".
            if (string.IsNullOrEmpty(href))
            {
                continue;
            }

            // ReURL converts the detail-page link into the download-page link.
            results.Add(new Fiction { Url = ReURL(href) });
        }

        // Hand the novels of this page to the pipeline.
        page.AddResultItem("FictionResult", results);

        // Paging: queue every pager link so the remaining listing pages are visited too.
        foreach (var url in page.Selectable.XPath("//div[@class=\"pager\"]/ul").Links().Nodes())
        {
            // The extracted link still carries HTML escaping; clean it before requesting.
            string u = replace(url.GetValue());

            page.AddTargetRequest(new Request(u, null));
        }
    }

    /// <summary>
    /// Turns HTML-escaped ampersands in an extracted link back into plain ampersands.
    /// </summary>
    /// <param name="v">The link text to clean up.</param>
    /// <returns>A link that can be requested normally.</returns>
    // Matching the whole entity (not the bare text "amp;") leaves unrelated text intact.
    private string replace(string v) => v.Replace("&amp;", "&");

    /// <summary>
    /// Converts a novel detail-page URL into its download-page URL.
    /// </summary>
    /// <param name="reurl">Detail-page URL, e.g. http://www.jjxsw.com/txt/26697.htm</param>
    /// <returns>Download-page URL, e.g. http://www.jjxsw.com/txt/dl-11-26697.html</returns>
    private string ReURL(string reurl)
    {
        // The scraped link ends in ".htm"; the download page needs ".html".
        string url = reurl + "l";

        // Prefix only the last path segment. Replacing every digit run in the whole URL
        // would also rewrite digits in the host, port or directory names.
        int lastSlash = url.LastIndexOf('/');
        return url.Substring(0, lastSlash + 1) + "dl-11-" + url.Substring(lastSlash + 1);
    }
}

/// <summary>
/// A single novel found on a listing page.
/// </summary>
internal class Fiction
{
    /// <summary>
    /// URL of the novel's download page.
    /// </summary>
    public string Url { get; set; }
}

/// <summary>
/// Processes the data parsed by <c>FictionPageProcessor</c>: gathers the download-page
/// URLs (e.g. http://www.jjxsw.com/txt/dl-11-26697.html) and passes them to
/// <c>DownFiction</c>.
/// </summary>
internal class FictionPipeline : BasePipeline
{
    /// <summary>
    /// Collects the URLs from every result item of this call and runs one
    /// <c>DownFiction</c> job over them.
    /// </summary>
    /// <param name="resultItems">Result items produced by the page processor.</param>
    /// <param name="spider">The spider that produced the results (not used).</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        // A per-call list: a list kept on the instance is never cleared, so each call
        // would hand all URLs from earlier calls to DownFiction again.
        List<string> urlList = new List<string>();

        foreach (var resultItem in resultItems)
        {
            foreach (Fiction entry in resultItem.Results["FictionResult"])
            {
                urlList.Add(entry.Url);
            }
        }

        // Nothing parsed in this call: no need to start a download-link crawl.
        if (urlList.Count == 0)
        {
            return;
        }

        // Resolve the actual download link of every novel collected above.
        DownFiction df = new DownFiction(urlList);
        df.Run();
    }
}

三：添加 DownFiction 类

using DotnetSpider.Core;

using DotnetSpider.Core.Downloader;

using DotnetSpider.Core.Pipeline;

using DotnetSpider.Core.Processor;

using DotnetSpider.Core.Scheduler;

using System;

using System.Collections.Generic;

using System.IO;

namespace SpiderFiction

{

/// <summary>
/// Resolves the real download link of every novel, for example
/// http://www.jjxsw.com/e/DownSys/doaction.php?enews=DownSoft&amp;classid=11&amp;id=26697&amp;pathid=0&amp;pass=ee247a67a5adcf1dfb1abecbd1ff5635&amp;p=:::
/// </summary>
class DownFiction
{
    /// <summary>
    /// Download-page URLs, e.g. http://www.jjxsw.com/txt/dl-11-26697.html
    /// </summary>
    private readonly List<string> urlList;

    /// <summary>
    /// Creates a job for the given download-page URLs.
    /// </summary>
    /// <param name="urlList">Download-page URLs to crawl.</param>
    /// <exception cref="ArgumentNullException"><paramref name="urlList"/> is null.</exception>
    public DownFiction(List<string> urlList)
    {
        if (urlList == null)
        {
            throw new ArgumentNullException(nameof(urlList));
        }

        this.urlList = urlList;
    }

    /// <summary>
    /// Crawls the download pages, then writes the collected download links to a local file.
    /// </summary>
    internal void Run()
    {
        XpathFiction(urlList);

        // The links are written to a file instead of being downloaded directly;
        // the author notes not knowing how to handle such URLs in C#.
        DownFictionPipeline df = new DownFictionPipeline();
        df.WriteToFile();
    }

    /// <summary>
    /// Creates the site configuration and the spider, and runs the crawl.
    /// </summary>
    /// <param name="urlList">Download-page URLs used as start URLs.</param>
    private void XpathFiction(List<string> urlList)
    {
        Site site = new Site { CycleRetryTimes = 3, SleepTime = 300 };
        site.AddStartUrls(urlList);

        Spider spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new DownFictionPageProcessor()).AddPipeline(new DownFictionPipeline());
        spider.Downloader = new HttpClientDownloader();
        spider.ThreadNum = 4;
        spider.EmptySleepTime = 2000;

        spider.Run();
    }
}

/// <summary>
/// Parses a novel's download page and extracts the download link.
/// </summary>
internal class DownFictionPageProcessor : BasePageProcessor
{
    // No explicit constructor: the implicit parameterless one is equivalent to the empty one.

    /// <summary>
    /// Stores the href of the first anchor with class "strong green" under the "url" key.
    /// </summary>
    /// <param name="page">The download page that was just fetched.</param>
    protected override void Handle(Page page)
    {
        // Download link example:
        // http://www.jjxsw.com/e/DownSys/doaction.php?enews=DownSoft&classid=11&id=26697&pathid=0&pass=ee247a67a5adcf1dfb1abecbd1ff5635&p=:::
        page.AddResultItem("url", page.Selectable.XPath("//a[@class=\"strong green\"][1]/@href").GetValue());
    }
}

/// <summary>
/// Gathers the download links found by <c>DownFictionPageProcessor</c> and writes
/// them to a local text file.
/// </summary>
internal class DownFictionPipeline : BasePipeline
{
    // Static on purpose: WriteToFile() is called on a different instance than the
    // one attached to the spider, so the list must be shared between instances.
    private static List<string> urls = new List<string>();

    // Location of the local output file.
    private readonly string path = @"D:\ASP.NET Core\Book\URL.txt";

    /// <summary>
    /// Adds the "url" value of every result item to the shared link list.
    /// </summary>
    /// <param name="resultItems">Result items produced by the page processor.</param>
    /// <param name="spider">The spider that produced the results (not used).</param>
    public override void Process(IEnumerable<ResultItems> resultItems, ISpider spider)
    {
        foreach (var result in resultItems)
        {
            string url = result.Results["url"] as string;

            // Pages without a matching anchor yield no link; skip them so they are not
            // written out as empty lines.
            if (!string.IsNullOrEmpty(url))
            {
                urls.Add(url);
            }
        }
    }

    /// <summary>
    /// Writes all collected links to the output file, one per line. Failures are
    /// reported on standard error and, when possible, appended to log.txt.
    /// </summary>
    public void WriteToFile()
    {
        try
        {
            // File.WriteAllLines does not create missing directories.
            Directory.CreateDirectory(Path.GetDirectoryName(path));

            File.WriteAllLines(path, urls);
        }
        catch (Exception ee)
        {
            Console.Error.WriteLine(ee.Message);

            try
            {
                using (StreamWriter sw = new StreamWriter(@"D:\ASP.NET CORE\Book\log.txt", true))
                {
                    sw.WriteLine(ee.Message);
                }
            }
            catch (Exception logError) when (logError is IOException || logError is UnauthorizedAccessException)
            {
                // The log lives in the same directory as the output file, so it is
                // usually unwritable for the same reason; do not let logging crash the run.
                Console.Error.WriteLine(logError.Message);
            }
        }
    }
}

点击：视图--终端，或者按 CTRL+`（反引号，即 TAB 上面那个键），输入：dotnet run，执行程序。

程序会将所有小说的下载链接保存到本地文件。

由于我不知道，如何在 C# 中使用这些小说的下载链接，所以保存到了本地。

接下来，我将使用 Python ，解析这些链接，并下载小说保存到本地。

四：新建一个 Python 文件

文件正在下载，保存。

如果有人看到此文章，并有其它想法，欢迎指教，谢谢。