Python

使用 Python requests 爬取小说

2017-05-03  本文已影响188人  粗识名姓

遇到了编码错误真的很蛋疼，卡了 2 个小时才解决！
参考文献 1、参考文献 2、参考文献 3

网站示例一:
# -*- coding: utf-8 -*-

import requests, re
from bs4 import BeautifulSoup

# Example 1: download a novel from 8shuw.com into novel.txt.
# The table-of-contents page is fetched first, then every chapter it links to
# is downloaded and appended to the output file.
content = 'http://www.8shuw.com/BookReader/24-24559.html'  # table-of-contents page
resp = requests.get(content, timeout=30)  # a timeout keeps a stalled server from hanging the script
resp.encoding = 'gbk'  # decode the page as GBK rather than relying on requests' guess
soup = BeautifulSoup(resp.text, 'lxml')
tbody = soup.find('table', {'border': '0', 'class': 'acss'}).find('tbody')
trs = tbody.find_all('a', {'itemprop': 'url', 'href': True})
# Keep only links whose text starts with "第" (chapter titles); drop the rest.
trs = [tr for tr in trs if re.match(u'^第.*$', tr.text) is not None]

# Inline ad-script call that appears inside the chapter text; compiled once
# instead of on every paragraph.
ad_call = re.compile(r'CNZZ_SLOT_RENDER\(\"\d{3,8}\"\)\;')

# Writing UTF-8 explicitly means every character can be stored, so the text no
# longer has to be pushed through a lossy encode/decode round trip just to
# avoid UnicodeEncodeError on platforms whose default encoding is GBK.
with open('novel.txt', 'w', encoding='utf-8') as f:
    # Walk the list back to front, keeping each link's position in ``trs``
    # for the progress output.
    for index in reversed(range(len(trs))):
        chapter = trs[index]
        f.write(chapter.text + '\n')  # chapter title
        resp = requests.get(chapter.get('href'), timeout=30)
        resp.encoding = 'gbk'
        soup = BeautifulSoup(resp.text, 'lxml')
        body = soup.find('div', {'id': 'readtext', 'class': 'fontm'})
        print(index, chapter.text)
        if body is None:
            # The page has no text container; skip this chapter instead of
            # crashing and losing everything downloaded so far.
            print('no chapter text found:', chapter.get('href'))
            continue
        for line in body.find_all('p'):
            f.write(ad_call.sub('', line.text) + '\n')
网站示例二:
import requests, re
from bs4 import BeautifulSoup

# Example 2: download a novel from piaotian.com into novel.txt, starting at a
# chosen position in the table of contents.
content = 'http://www.piaotian.com/html/5/5896/'  # table-of-contents page; chapter hrefs are appended to it
start = 1980  # position in the chapter list of the first chapter to download

resp = requests.get(content, timeout=30)  # a timeout keeps a stalled server from hanging the script
resp.encoding = 'gbk'  # decode the page as GBK rather than relying on requests' guess
soup = BeautifulSoup(resp.text, 'lxml')
tbody = soup.find('div', {'class': 'centent'})
trs = tbody.find_all('a', {'href': True})
# Keep only links whose text starts with "第" (chapter titles); drop the rest.
trs = [tr for tr in trs if re.match(u'^第.*$', tr.text) is not None]
print('Count:', len(trs))
if start < len(trs):
    print(trs[start].text, 'href =', trs[start].get('href'))
else:
    # Fewer chapters than expected: report it instead of raising IndexError.
    print('start index', start, 'is past the last chapter; nothing to download')

# Writing UTF-8 explicitly means every character can be stored as-is; the
# text is decoded once when the response is read and never re-encoded.
with open('novel.txt', 'w', encoding='utf-8') as f:
    # Iterate trs[start:]; use reversed(trs) instead if the table of contents
    # is listed in reverse order.
    for index, chapter in enumerate(trs[start:], start):
        print(index, chapter.text)
        f.write(chapter.text + '\n')  # chapter title
        resp = requests.get(content + chapter.get('href'), timeout=30)
        # Presumably chapter pages use the same encoding as the index page;
        # confirm against the site.
        resp.encoding = 'gbk'
        soup = BeautifulSoup(resp.text, 'html.parser')  # different parser from example 1
        # NOTE(review): this takes the chapter text from the <br> tags, which
        # only works if the parser nests the following text inside each <br>.
        # Confirm that the installed bs4 version still does so.
        for line in soup.find_all('br'):
            if line.text:
                f.write(line.text + '\n')
上一篇 下一篇

猜你喜欢

热点阅读