爬虫18:练习之小说下载
2022-09-14 本文已影响0人
_百草_
# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_fiction.py
@time:2022/9/14
"""
import time
import random
from faker import Faker
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from queue import Queue
class Fiction:
    """Download the novel "Romance of the Three Kingdoms" from
    shicimingju.com chapter by chapter and append each chapter to
    fiction.txt in the current directory."""

    def __init__(self):
        self.base_url = "https://www.shicimingju.com"
        # Table-of-contents page listing every chapter of the novel.
        self.url = self.base_url + "/book/sanguoyanyi.html"
        fake = Faker(locale="zh_CN")
        # Random user-agent per run so requests look less like a bot.
        self.headers = {
            "user-agent": fake.user_agent()
        }
        self.q = Queue()  # FIFO of chapter URLs to download

    def get_request(self, url):
        """Fetch *url* with our headers and return the raw body as bytes."""
        req = Request(url, headers=self.headers)
        # Context manager closes the connection deterministically
        # (the original leaked the response object).
        with urlopen(req) as resp:
            return resp.read()

    @staticmethod
    def save_file(content):
        """Append *content* (str) to fiction.txt, UTF-8 encoded."""
        with open("fiction.txt", "a", encoding="utf-8") as f:
            f.write(content)

    def get_urls(self):
        """Parse the table of contents and enqueue every chapter URL."""
        html = self.get_request(self.url)
        soup = BeautifulSoup(html, "lxml")
        mulus = soup.select(".book-mulu a")  # one <a> per chapter
        for mulu in mulus:
            self.q.put(self.base_url + mulu["href"])
        print(f"self.q={self.q}")

    def parse_html(self, url):
        """Fetch one chapter page and return its visible text content."""
        html = self.get_request(url).decode("utf8")  # bytes -> str
        soup = BeautifulSoup(html, "lxml")
        # Page layout (for reference):
        # <div id="main_left">
        #   <div class="card bookmark-list">
        #     <h1>chapter title</h1>
        #     <div class="chapter_content">chapter body</div>
        #   </div>
        #   <div class="book-page-nav">pagination</div>
        # </div>
        eles = soup.select("#main_left .card")
        # .get_text() (or .text) extracts the text of each matched element.
        return "".join(ele.get_text() for ele in eles)

    def run(self):
        """Download every chapter sequentially, appending each to disk."""
        self.get_urls()
        while not self.q.empty():
            url = self.q.get()
            print(f"-----开始下载:{url}-----")
            content = self.parse_html(url)
            self.save_file(content)
            time.sleep(random.uniform(1, 3))  # throttle to avoid hammering the site
            print(f"-----结束下载:{url}-----")
if __name__ == "__main__":
    # Entry point: download the whole novel to fiction.txt.
    fiction = Fiction()
    fiction.run()
反思:
- 每次请求之间加入随机时间间隔,降低请求频率,避免被网站封禁:
time.sleep(random.uniform(1, 3))  # 随机等待 1~3 秒,避免请求频繁
- 文本保存异常:urlopen 返回的响应体是 bytes,直接写入以文本模式打开的文件会出错,需先按 utf-8 解码为字符串:
html = self.get_request(url).decode("utf8")  # 将响应字节解码为字符串