# Python crawler: scrapes novel content and saves each chapter as a txt file.
# (Blog header, kept for provenance: published 2023-06-07, author 我的小小笔尖)
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import lxml
import os
import time
# 请求
def requestByUrl(url):
    """GET *url* and return the Response on HTTP 200, otherwise None.

    Sleeps one second after every request to throttle scraping.

    :param url: absolute URL to fetch
    :return: requests.Response on status 200, else None
    """
    # timeout added: requests.get without a timeout can block forever
    resp = requests.get(url, timeout=30)
    time.sleep(1)  # throttle: avoid hitting the server too frequently
    if resp.status_code == 200:
        return resp
    # Original had a bare `None` expression here (a no-op statement);
    # make the None return explicit.
    return None
# 获取章节清单
def getChapterList(url):
    """Fetch the novel index page and return its chapter link tags.

    :param url: URL of the chapter index page
    :return: list of <a> Tag elements matching 'div > dl > dd > a'
    :raises RuntimeError: if the index page could not be fetched
    """
    resp = requestByUrl(url)
    if resp is None:
        # requestByUrl returns None on non-200; fail loudly here instead of
        # crashing later with AttributeError on resp.content.
        raise RuntimeError('Failed to fetch chapter list: ' + url)
    soup = BeautifulSoup(resp.content, 'lxml')
    chapter_list = soup.select('div > dl > dd > a')
    return chapter_list
# 获取章节正文
def getChapterContent(url):
    """Fetch one chapter page and return its body text.

    :param url: URL of a single chapter page
    :return: chapter text extracted from the element with id 'content'
    :raises RuntimeError: if the page could not be fetched or has no #content
    """
    resp = requestByUrl(url)
    if resp is None:
        # Guard against a failed fetch instead of AttributeError on None.
        raise RuntimeError('Failed to fetch chapter: ' + url)
    soup = BeautifulSoup(resp.content, 'lxml')
    node = soup.select_one('#content')
    if node is None:
        # select(...)[0] used to raise a bare IndexError here.
        raise RuntimeError('No #content element found at: ' + url)
    # NOTE(review): second positional arg of get_text is `strip`; '<br>' is
    # merely truthy, so this strips whitespace — presumably intended, kept as-is.
    chapter_content = node.get_text("\n", '<br>')
    return chapter_content
# 保存为本地文本文件
def saveChapterContent2TxtFile(filePath, fileName, fileContent):
    """Write *fileContent* to filePath/fileName as a UTF-8 text file.

    :param filePath: existing directory to write into
    :param fileName: name of the txt file to create
    :param fileContent: text to write (overwrites any existing file)
    """
    txtFile = os.path.join(filePath, fileName)
    print (time.strftime('%Y-%m-%d %H-%M-%S'), 'Create Txt File.', txtFile)
    # encoding is explicit: chapter text is Chinese, and the platform default
    # codec (e.g. GBK/cp1252 on Windows) may fail to encode it.
    with open(txtFile, 'w', encoding='utf-8') as file:
        file.write(fileContent)
    # redundant file.close() removed — the `with` block closes the file
# 主函数
def main():
    """Download the target novel's chapters and save them under ./author/title/.

    Currently saves only the first chapter (the loop breaks after one
    iteration) — remove the `break` to download everything.
    """
    # Hard-coded target novel metadata.
    novel_title = '深空彼岸'
    novel_author = '辰东'
    novel_base_url = 'https://www.hongyue8.net/'  # fixed typo: was novle_base_url
    novel_index_url = 'https://www.hongyue8.net/68/68276/'
    # Ensure the output directory ./<author>/<title>/ exists.
    novel_save_path = os.path.join('./', novel_author, novel_title)
    if os.path.exists(novel_save_path):
        print ('Folder Exists.')
    else:
        print ('Folder Not Exists. Create Folder.', novel_save_path)
        os.makedirs(novel_save_path)
    # Fetch the chapter list, then download and save each chapter.
    chapter_list = getChapterList(novel_index_url)
    for chapter in chapter_list:  # unused enumerate index removed
        # Chapter URL: index page hrefs are relative to the site root.
        chapter_href = novel_base_url + chapter['href']
        # Chapter title doubles as the output file name.
        chapter_title = chapter.getText()
        fileName = chapter_title + '.txt'
        chapter_content = getChapterContent(chapter_href)
        saveChapterContent2TxtFile(novel_save_path, fileName, chapter_content)
        # Stop after the first chapter (demo/testing behavior, kept as-is).
        break
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()