Python 爬虫,爬取小说内容,按章节保存txt文件

2023-06-07  本文已影响0人  我的小小笔尖
#coding=utf-8

import requests

from bs4 import BeautifulSoup
import lxml

import os
import time


# Request
def requestByUrl(url):
    """Fetch ``url`` with an HTTP GET.

    Args:
        url: Address of the page to download.

    Returns:
        The ``requests`` response object when the server answers with
        status 200, otherwise ``None``.

    Raises:
        requests.RequestException: Propagated unchanged on network errors,
            including the timeout configured below.
    """
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(url, timeout=10)
    time.sleep(1)  # Pause between requests to avoid hitting the site too often.
    if resp.status_code == 200:
        return resp
    # Make the failure result explicit instead of a bare ``None`` expression.
    return None


# Fetch the chapter list
def getChapterList(url):
    """Download the novel's index page and return its chapter links.

    Args:
        url: Address of the novel's index (table of contents) page.

    Returns:
        The list of ``<a>`` tags matched by the CSS selector
        ``div > dl > dd > a``; empty when the page has no such links.

    Raises:
        RuntimeError: If the index page could not be fetched, i.e.
            ``requestByUrl`` returned ``None``.
    """
    resp = requestByUrl(url)
    if resp is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError('Failed to fetch chapter list: ' + url)
    soup = BeautifulSoup(resp.content, 'lxml')
    return soup.select('div > dl > dd > a')


# Fetch the chapter body text
def getChapterContent(url):
    """Download a chapter page and return its body text.

    Args:
        url: Address of the chapter page.

    Returns:
        The text of the element with id ``content``, with text fragments
        stripped of surrounding whitespace and joined by newlines.

    Raises:
        RuntimeError: If the page could not be fetched (``requestByUrl``
            returned ``None``) or contains no ``#content`` element.
    """
    resp = requestByUrl(url)
    if resp is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError('Failed to fetch chapter page: ' + url)
    soup = BeautifulSoup(resp.content, 'lxml')
    content_node = soup.select_one('#content')
    if content_node is None:
        raise RuntimeError('No #content element found in: ' + url)
    # The second argument of get_text is the boolean ``strip`` flag; the
    # previous '<br>' value only worked because a non-empty string is truthy.
    return content_node.get_text("\n", strip=True)


# Save as a local text file
def saveChapterContent2TxtFile(filePath, fileName, fileContent):
    """Write ``fileContent`` to ``filePath/fileName`` as UTF-8 text.

    An existing file with the same name is overwritten. A timestamped
    progress line is printed before writing.

    Args:
        filePath: Directory that receives the file; it must already exist.
        fileName: Name of the file to create inside ``filePath``.
        fileContent: Text to write.
    """
    txtFile = os.path.join(filePath, fileName)
    print (time.strftime('%Y-%m-%d %H-%M-%S'), 'Create Txt File.', txtFile)
    # Pin the encoding: the platform default (e.g. GBK/cp1252 on Windows)
    # can fail on, or mis-encode, Chinese text.
    with open(txtFile, 'w', encoding='utf-8') as out:
        out.write(fileContent)
    # No explicit close() needed: the with-block closes the file.


# Main function
def main(max_chapters=1):
    """Download chapters of the configured novel into per-chapter txt files.

    Files are written to ``./<author>/<title>/<chapter title>.txt``; the
    folder is created when missing.

    Args:
        max_chapters: Maximum number of chapters to save. Defaults to 1
            (only the first chapter, as before); pass ``None`` to save
            every chapter in the list.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    # Basic information
    novel_title = '深空彼岸'
    novel_author = '辰东'
    novel_index_url = 'https://www.hongyue8.net/68/68276/'

    # Make sure the novel's output folder exists
    novel_save_path = os.path.join('./', novel_author, novel_title)
    if os.path.exists(novel_save_path):
        print ('Folder Exists.')
    else:
        print ('Folder Not Exists. Create Folder.', novel_save_path)
        os.makedirs(novel_save_path, exist_ok=True)

    # Characters that are not allowed in file names on common platforms
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    # Walk the chapter list
    saved = 0
    for chapter in getChapterList(novel_index_url):
        if max_chapters is not None and saved >= max_chapters:
            break
        href = chapter.get('href')
        if not href:
            # An anchor without a target cannot be downloaded.
            continue
        # Resolve the link against the index page, as a browser would; plain
        # concatenation yields '//' for root-relative hrefs and a wrong path
        # for page-relative ones.
        chapter_href = urljoin(novel_index_url, href)
        # Chapter title, made safe for use as a file name
        chapter_title = chapter.getText().strip().translate(unsafe_chars)
        fileName = chapter_title + '.txt'
        # Chapter text
        chapter_content = getChapterContent(chapter_href)
        # Save as a local text file
        saveChapterContent2TxtFile(novel_save_path, fileName, chapter_content)
        saved += 1


if __name__ == '__main__':
    main()
上一篇 下一篇

猜你喜欢

热点阅读