Scraping Novel Chapters with Python 3 (2020-09-24)

#coding=utf-8
import urllib
import re
import os
import glob
import urllib.request


from urllib.parse import urljoin
from urllib.parse import urlparse

def cleanhtml(raw_html):
    # Strip all HTML tags from a string with a non-greedy regex.
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
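
# Illustrative example (mine, not from the original post), using the chapter
# markup this script scrapes:
#   cleanhtml('<font size="5">第205章行内人的密报</font>') -> '第205章行内人的密报'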


def fetch_section(webroot):
    for page in range(8,9):
        print('Downloading page ' + str(page) + ' of the novel list')

        url = 'http://www.cangshubao.net/forum-915-'+str(page)+'.html'
        headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'  }
        try:
            request = urllib.request.Request(url,headers=headers)
            response = urllib.request.urlopen(request,timeout=180)
            #print (response.read().decode('gbk'))
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
            continue    # skip this page; response would be undefined below

        # The site serves GBK-encoded pages.
        html = response.read().decode('gbk')
        # Each chapter row looks like:
        # <span id="..."><a href="thread-1453325-1-3.html">第205章行内人的密报</a></span>
        pattern = re.compile(u'<span id="(.*?)"><a href="(.*?)">第(.*?)</a></span>')
        items = re.findall(pattern,html)
        # print (items)

        for item in items:
            try:
                book_thread = item[0]   # span id (not used below)
                book_link = item[1]     # relative chapter link
                book_name = item[2]     # chapter title, minus the leading '第'
                book_full_link = urljoin(webroot, book_link)   # build the chapter's absolute URL
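                # For example (illustrative, using the thread link format above):
                # urljoin('http://www.cangshubao.net/', 'thread-1453325-1-3.html')
                # -> 'http://www.cangshubao.net/thread-1453325-1-3.html'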

                # Request the chapter page.
                try:
                    request = urllib.request.Request(book_full_link,headers=headers)
                    response = urllib.request.urlopen(request,timeout=180)
                except urllib.error.URLError as e:
                    if hasattr(e,"code"):
                        print(e.code)
                    if hasattr(e,"reason"):
                        print(e.reason)
                    continue    # skip this chapter on a failed request
                html = response.read().decode('gbk')
                
                pattern = re.compile('<table cellspacing="0" cellpadding="0"><tr><td class="t_msgfont" id="(.*?)"><font size="5">(.*?)</font></td></tr></table>',re.S)
                section_content = re.findall(pattern,html)
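                # Each match is a (td id, chapter body HTML) tuple;
                # section_content[0][1] below is the body of the first match.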

                # Write the chapter text to its own .txt file.
                try:
                    fp = open(book_name+'.txt','w')
                except IOError as e:
                    # Some titles still contain HTML (e.g. <font> tags), which
                    # breaks the filename; extract the inner text and retry.
                    pattern = re.compile('<font size="5">(.*?)</font><br />',re.S)
                    book_name = re.findall(pattern,book_name)
                    fp = open(book_name[0]+'.txt','w')
                print('start download')
                cleanTxt = cleanhtml(str(section_content[0][1]))
                fp.write(cleanTxt)
                print('download finished\n')
                fp.close()
            except Exception as e:
                print('Failed to parse this entry, skipping it')
                print(e)
                print('')
                fp = open('error.log','a')
                fp.write('page:'+str(page)+'\n')
                # Log the whole tuple: item has only three groups, and a
                # text-mode file needs str, not GBK bytes.
                fp.write(str(item))
                fp.write('\nThere was an error in the parsing process.\n\n')
                fp.close()

def merge_txt_files():
    read_files = glob.glob("*.txt")
    # print (read_files)
    with open("result.txt", "wb") as outfile:
        for f in read_files:
            if f == "result.txt":
                # Don't merge a previous run's output into itself.
                continue
            with open(f, "rb") as infile:
                print(f)
                outfile.write(f.encode('gbk') + b'\n')   # filename as a section header
                outfile.write(infile.read())

def main():
    # Crawl the chapter list pages, then merge the downloaded chapters.
    webroot = 'http://www.cangshubao.net/'
    # fetch_section(webroot)
    merge_txt_files()

if __name__ == '__main__':
    main()
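
One caveat with merge_txt_files: glob.glob returns filenames in arbitrary order, so chapters can be concatenated out of sequence. Below is a minimal sketch of a sorted merge, assuming each filename begins with the chapter number (which is what the '第(.*?)' capture group above produces, e.g. '205章行内人的密报.txt'); the helper name chapter_key is mine, not from the original post:

import glob
import re

def merge_txt_files_sorted():
    def chapter_key(name):
        # Sort by the leading chapter number; unnumbered files sort last.
        m = re.match(r'(\d+)', name)
        return int(m.group(1)) if m else float('inf')

    read_files = sorted(glob.glob("*.txt"), key=chapter_key)
    with open("result.txt", "wb") as outfile:
        for f in read_files:
            if f == "result.txt":
                continue
            with open(f, "rb") as infile:
                outfile.write(f.encode('gbk') + b'\n')
                outfile.write(infile.read())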