python爬取笔趣阁的热门小说并保存
2020-03-27 本文已影响0人
刘年
过程比较简单,但是思路一定要清晰
第一、获取该页的小说名和地址 第二、获取小说章节名和地址 第三、解析章节内容,获取文本
import requests
from lxml import etree
import time
base_url ='https://www.biquge5200.cc/xiuzhenxiaoshuo/'
headers={
'referer': 'https://www.biquge5200.cc/',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36',
}
# Fetch the names and URLs of every novel on the "hot novels" listing page.
def many_novel_message():
    """Scrape the listing page and return a mapping of novel title -> novel URL.

    Returns:
        dict[str, str]: keys are novel titles, values are the novel index-page
        URLs, both taken from the ``span.s2`` anchors inside ``div.l``.
    """
    main_sourse = requests.get(base_url, headers=headers).text
    main_html = etree.HTML(main_sourse)
    novel_urls = main_html.xpath('//div[@class="l"]//span[@class="s2"]//a/@href')
    novel_titles = main_html.xpath('//div[@class="l"]//span[@class="s2"]//a/text()')
    # zip pairs title/url safely even if the two lists differ in length,
    # instead of indexing one list with the other's enumerate index.
    return {title: url for title, url in zip(novel_titles, novel_urls)}
# Fetch the chapter titles and chapter URLs of one novel.
def chap_message(novel_url):
    """Scrape a novel's index page and return chapter title -> chapter URL.

    Args:
        novel_url: URL of the novel's index page
            (e.g. ``https://www.biquge5200.cc/46_46254/``).

    Returns:
        dict[str, str]: chapter title mapped to its page URL. The first nine
        anchors are skipped (``[9:]``) — on this site they are "latest
        chapter" shortcuts, not the real chapter list.
    """
    chap_sourse = requests.get(novel_url, headers=headers).text
    chap_html = etree.HTML(chap_sourse)
    chap_titles = chap_html.xpath('//div[@class="box_con"][2]//dl/dd/a/text()')[9:]
    chap_urls = chap_html.xpath('//div[@class="box_con"][2]//dl/dd/a/@href')[9:]
    # Pair the parallel lists directly rather than indexing by position.
    return dict(zip(chap_titles, chap_urls))
# Given a chapter URL, fetch and return the chapter's text content.
def chap_cont(chap_url):
    """Download one chapter page and return its body text.

    Args:
        chap_url: URL of a single chapter page
            (e.g. ``https://www.biquge5200.cc/46_46254/17700048.html``).

    Returns:
        str: all ``<p>`` paragraphs under ``div#content`` joined with newlines.
    """
    detail_sourse = requests.get(chap_url, headers=headers).text
    detail_html = etree.HTML(detail_sourse)
    chap_conts = detail_html.xpath('//div[@id="content"]/p/text()')
    # Removed the dead `conts = []` — it was overwritten immediately.
    conts = '\n'.join(chap_conts)
    # Throttle requests to one per second to avoid hammering the site.
    time.sleep(1)
    return conts
if __name__ == '__main__':
    # Get every novel title and its index-page URL from the listing page.
    novel_dict = many_novel_message()
    # For each novel, scrape its chapter list and append every chapter's
    # text to a per-novel file under the 小说/ directory.
    for novel_title, novel_url in novel_dict.items():
        # `with` guarantees the file is closed even if a request raises,
        # fixing the handle leak of the bare open()/close() pair.
        with open('小说\\{0}.txt'.format(novel_title), 'a',
                  encoding='utf-8', errors='ignore') as ff:
            # Chapter name -> chapter URL for this novel.
            chap_dict = chap_message(novel_url)
            for chap_title, chap_url in chap_dict.items():
                # Fetch the chapter body and append it under its title.
                conts = chap_cont(chap_url)
                ff.write(chap_title + '\n' + conts + '\n')