Python3 爬取小说章节 (2020-09-24)
2020-09-25 本文已影响0人
番茄晓蛋
#coding=utf-8
import urllib
import re
import os
import glob
import urllib.request
from urllib.parse import urljoin
from urllib.parse import urlparse
# Matches one HTML tag. ``[^>]`` (unlike ``.``) also matches newlines, so a
# tag whose attributes are wrapped over several lines is still removed.
_TAG_RE = re.compile(r'<[^>]*>')


def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` tag removed.

    Only the tags themselves are stripped; text between tags, whitespace and
    HTML entities (e.g. ``&nbsp;``) are left untouched.

    Args:
        raw_html: HTML fragment as a string.

    Returns:
        The fragment's text with all tags deleted.
    """
    return _TAG_RE.sub('', raw_html)
def _fetch_html(url, headers):
    """Download *url* and return its body decoded as GBK.

    Returns None (after printing the HTTP code and/or reason) when the
    request fails with ``urllib.error.URLError``, so the caller can skip the
    page instead of reading from a missing or stale response object.
    """
    request = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(request, timeout=180) as response:
            return response.read().decode('gbk')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None


def _save_chapter(webroot, item, headers, content_pattern, title_pattern):
    """Download one chapter and write its tag-stripped text to ``<name>.txt``.

    *item* is one ``(span id, href, name)`` match from the listing page.
    Raises when the page cannot be fetched, when no chapter text is found,
    or when no usable file name can be derived; the caller logs the failure.
    """
    _span_id, book_link, book_name = item
    # Build the chapter's absolute URL.
    book_full_link = urljoin(webroot, book_link)
    html = _fetch_html(book_full_link, headers)
    if html is None:
        raise RuntimeError('could not fetch ' + book_full_link)

    section_content = content_pattern.findall(html)
    if not section_content:
        # Checked before opening the output so no empty file is left behind.
        raise ValueError('no chapter text found at ' + book_full_link)
    clean_txt = cleanhtml(section_content[0][1])

    # Written as GBK (the encoding the site is decoded with) so the files do
    # not depend on the platform's default encoding.
    try:
        fp = open(book_name + '.txt', 'w', encoding='gbk')
    except OSError:
        # The captured name cannot be used as a file name; fall back to the
        # title wrapped in a <font size="5"> element inside it, if any.
        titles = title_pattern.findall(book_name)
        if not titles:
            raise
        fp = open(titles[0] + '.txt', 'w', encoding='gbk')
    with fp:
        print('start download')
        fp.write(clean_txt)
        print('down finish\n')


def fetch_section(webroot, pages=range(8, 9)):
    """Download every chapter linked from the forum listing pages.

    For each listing page, the chapter links are extracted, each chapter page
    is fetched, its text is stripped of HTML tags and saved to
    ``<chapter name>.txt`` in the current directory. A chapter that fails is
    reported on stdout, appended to ``error.log`` and skipped; a listing page
    that cannot be fetched is skipped as well.

    Args:
        webroot: Base URL against which relative chapter links are resolved.
        pages: Iterable of listing page numbers to crawl. Defaults to page 8
            only.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    # Compiled once; these do not change between pages or chapters.
    link_pattern = re.compile('<span id="(.*?)"><a href="(.*?)">第(.*?)</a></span>')
    content_pattern = re.compile('<table cellspacing="0" cellpadding="0"><tr><td class="t_msgfont" id="(.*?)"><font size="5">(.*?)</font></td></tr></table>', re.S)
    title_pattern = re.compile('<font size="5">(.*?)</font><br />', re.S)

    for page in pages:
        print('正在下载第'+str(page)+'页小说')
        url = 'http://www.cangshubao.net/forum-915-'+str(page)+'.html'
        html = _fetch_html(url, headers)
        if html is None:
            continue

        for item in link_pattern.findall(html):
            # Best-effort per chapter: one bad entry must not stop the crawl.
            try:
                _save_chapter(webroot, item, headers,
                              content_pattern, title_pattern)
            except Exception as e:
                print('该条目解析出现错误,忽略')
                print(e)
                print('')
                with open('error.log', 'a', encoding='utf-8') as log:
                    log.write('page:'+str(page)+'\n')
                    # item has exactly three groups; index 2 is the name.
                    log.write(item[2])
                    log.write('\nThere is an error in parsing process.\n\n')
def merge_txt_files():
    """Concatenate every ``*.txt`` file in the current directory.

    The output is written to ``result.txt``. For each input file, its file
    name (encoded as GBK) is written first, immediately followed by the
    file's raw bytes. Each file name is also printed as it is processed.

    ``result.txt`` itself is never used as an input, and inputs are merged
    in sorted name order so repeated runs produce the same output.
    """
    output_name = "result.txt"
    # glob order is unspecified, and a result.txt left over from an earlier
    # run would otherwise be truncated and then merged into itself.
    read_files = sorted(
        name for name in glob.glob("*.txt") if name != output_name
    )
    with open(output_name, "wb") as outfile:
        for f in read_files:
            with open(f, "rb") as infile:
                print(f)
                # 'replace' keeps one un-encodable name from aborting the
                # whole merge half-way through.
                outfile.write(f.encode('gbk', errors='replace'))
                outfile.write(infile.read())
def main():
    """Script entry point: merge the chapter files already on disk.

    The download step is currently disabled; uncomment the ``fetch_section``
    call below to fetch the chapters before merging them.
    """
    # Root URL of the site, passed to fetch_section to resolve chapter links.
    site_root = 'http://www.cangshubao.net/'
    # fetch_section(site_root)
    merge_txt_files()


if __name__ == '__main__':
    main()