爬取 笔趣网 小说

2018-05-01  本文已影响23人  52_St
import os
from urllib.parse import urljoin

import requests
from lxml import etree

'''
下载网站 www.biquw.com 的小说
'''


class BiquwNovel:
    """Download a novel from www.biquw.com and store it chapter by chapter.

    The instance is created with the URL of a book's index page.
    ``download`` then fetches every chapter linked from that page and writes
    each one to its own UTF-8 text file inside a directory named after the
    book.
    """

    # Browser-like headers sent with every request.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    # Seconds to wait for the server before giving up on a request;
    # without a timeout a stalled connection would block forever.
    _TIMEOUT = 10
    # Characters that are illegal in Windows file names (plus control
    # whitespace) are replaced by an underscore.
    _UNSAFE_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|\r\n\t', '_'))

    def __init__(self, url):
        """Remember the URL of the book's index (chapter list) page."""
        self.__url = url

    def __parse(self, url):
        """Fetch ``url`` and parse it into ``self.__tree``.

        Returns:
            True when the page was downloaded and parsed, False otherwise.
            Failures are printed and never raised, so callers can simply
            skip the page.
        """
        try:
            response = requests.get(url, headers=self._HEADERS, timeout=self._TIMEOUT)
            # Treat HTTP error pages (404, 500, ...) as failures instead of
            # parsing them as if they were chapter content.
            response.raise_for_status()
        except requests.RequestException as e:
            print(e)
            return False
        self.__html = response.text
        try:
            self.__tree = etree.HTML(self.__html)
        except (etree.LxmlError, ValueError) as e:
            print(e)
            return False
        # Guard against a parser result of None so callers never call
        # .xpath() on a missing tree.
        return self.__tree is not None

    def download(self, file_path):
        """Download the whole book into a sub-directory of ``file_path``.

        Args:
            file_path: Directory under which a folder named after the book
                is created (if missing) to hold one ``.txt`` file per
                chapter.

        Pages that cannot be fetched, or that lack an ``<h1>`` title, are
        skipped; if the index page itself fails, nothing is downloaded.
        """
        if not self.__parse(self.__url):
            return
        titles = self.__tree.xpath('//h1/text()')
        if not titles:
            print('未找到书名: {}'.format(self.__url))
            return
        book_name = titles[0].strip()
        # Collect the chapter links before the tree is replaced by the
        # first chapter page.
        chapter_links = self.__tree.xpath('//div[@class="book_list"]/ul//li/a/@href')
        # The book title becomes the storage directory name.
        file_path = os.path.join(file_path, self.__safe_name(book_name))
        os.makedirs(file_path, exist_ok=True)
        print('开始下载...{}'.format(book_name))
        for link in chapter_links:
            # urljoin copes with relative, root-relative and absolute hrefs.
            if not self.__parse(urljoin(self.__url, link)):
                continue
            headings = self.__tree.xpath('//h1/text()')
            if not headings:
                print('未找到章节标题: {}'.format(link))
                continue
            chapter_name = headings[0].strip()
            content = self.__tree.xpath('//div[@id="htmlContent"]/text()')
            self.__process_text(file_path, chapter_name, content)
            print(chapter_name + '...下载完成!')

    @staticmethod
    def __safe_name(name, default='untitled'):
        """Turn a page title into a string usable as a file or folder name."""
        # Strip first so a trailing newline is dropped rather than turned
        # into '_'; then drop edge spaces/dots, which Windows rejects.
        cleaned = name.strip().translate(BiquwNovel._UNSAFE_CHARS).strip(' .')
        return cleaned or default

    @staticmethod
    def __process_text(file_path, chapter_name, content):
        """Write one chapter to ``<file_path>/<chapter_name>.txt``.

        Args:
            file_path: Existing directory that receives the file.
            chapter_name: Chapter title; sanitized before use as file name.
            content: Iterable of text fragments. Each is stripped, blank
                ones are dropped, and the rest are written separated by an
                empty line.
        """
        target = os.path.join(file_path, '{}.txt'.format(BiquwNovel.__safe_name(chapter_name)))
        # Explicit UTF-8 so the output does not depend on the platform's
        # default codec.
        with open(target, 'w', encoding='utf-8') as f:
            for text in content:
                text = text.strip()
                if text:
                    f.write(text + '\n\n')


# Run the download only when executed as a script, so that importing this
# module does not trigger network traffic or file writes.
if __name__ == '__main__':
    # Create a downloader for the book's index page.
    book = BiquwNovel('http://www.biquw.com/book/900/')
    # Download and save the novel. A raw string keeps the Windows
    # backslashes literal; '\P' and '\p' are invalid escape sequences in a
    # normal string literal.
    book.download(r'E:\Python\python_work')
QQ图片20180501185605.png
上一篇下一篇

猜你喜欢

热点阅读