Python抓取小说
2016-11-10 本文已影响0人
爱要趁早
缘由
唐家三少的《龙王传说》已经连载了不少章节,刚好在浏览器里看到了推荐,于是点进去看了看。然而页面上的广告实在太多,阅读体验很差,干脆用Python把小说内容抓取下来,整理成一个文本文件。
环境
Windows,Python 2.x,requests,lxml
代码
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again
# (it is removed from the sys namespace during interpreter start-up), so
# the process-wide default codec can be switched to UTF-8. The implicit
# str/unicode conversions later in this script rely on that default.
reload(sys)
sys.setdefaultencoding("utf-8")
import requests
def getHtml(url,headers=None,timeout=30):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    Args:
        url: Address of the page to download.
        headers: Optional dict of HTTP request headers. ``None`` lets
            ``requests`` send its own defaults.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the server does not answer within ``timeout`` seconds.
    """
    # Without an explicit timeout requests waits indefinitely, so a single
    # stalled chapter download would hang the whole crawl.
    r = requests.get(url, headers=headers, timeout=timeout)
    return r.content
def useXpath(html, base_url='http://www.aiquxs.com/read/41/41742/',
             out_path='d://xiaoshu8888.txt'):
    """Download every chapter linked from an index page into one text file.

    The index page is parsed for chapter links found under the element
    with ``id="list"``. For each link the chapter title is appended to
    ``out_path``, the chapter page is fetched with ``getHtml`` and every
    text node under the element with ``id="booktext"`` is appended on its
    own line.

    Args:
        html: Source of the index (table-of-contents) page.
        base_url: Prefix joined with each chapter's ``href`` to build the
            chapter address. ``base_url + 'index.html'`` is sent as the
            ``Referer`` header.
        out_path: File the novel is appended to (opened in mode ``'a'``).

    Returns:
        None. Output goes to ``out_path``; progress is printed per chapter.
    """
    from lxml import etree

    index_page = etree.HTML(html)
    headers = {
        'Referer': base_url + 'index.html',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    # Query the anchors once and read title and href from the same element.
    # Two parallel lists (one of text nodes, one of hrefs) drift out of
    # step as soon as a single anchor lacks text or an href.
    links = index_page.xpath('//*[@id="list"]/dl/dd/a')
    with open(out_path,'a') as f:
        for link in links:
            href = link.get('href')
            if not href:
                # Nothing to download for an anchor without a target.
                continue
            title = link.text or u''
            url = base_url + href
            # Single-argument form prints the same line as the original
            # statement and parses on both Python 2 and Python 3.
            print(u' '.join([u'正在抓取 ', title, u' 网址为: ' + url]))
            f.write(title+'\n')  # chapter title first
            chapter_page = etree.HTML(getHtml(url,headers))
            for item in chapter_page.xpath('//*[@id="booktext"]/text()'):
                f.write(item+'\n')
if __name__ == '__main__':
    # Table-of-contents page that links to every chapter of the novel.
    index_url = 'http://www.aiquxs.com/read/41/41742/index.html'
    # Fetch the index and hand its source straight to the chapter crawler.
    useXpath(getHtml(index_url))
运行图片
结束语
喜欢的话,欢迎关注、打赏、收藏,谢谢!