小说爬虫

2018-08-14  本文已影响0人  淡然z
from urllib import request
from bs4 import BeautifulSoup

def _fetch_soup(page_url, headers):
    """Download ``page_url`` and return it parsed as a BeautifulSoup tree.

    Args:
        page_url: Absolute URL of the page to fetch.
        headers: HTTP request headers sent with the request.

    Returns:
        The page parsed with the ``lxml`` parser.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    page_req = request.Request(page_url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(page_req) as response:
        return BeautifulSoup(response.read(), 'lxml')


if __name__ == '__main__':
    # Script entry point: read the chapter index at ``url``, download every
    # linked chapter in order, and append "<title><text>\n\n" for each one to
    # a single UTF-8 file under ``imgs/``.

    # Imported here so the script block carries its own extra dependencies.
    import os
    import sys
    from urllib.parse import urljoin

    url = 'https://www.biquge.info/11_11668/'
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'}
    out_dir = 'imgs'
    out_path = os.path.join(out_dir, '黄金瞳.txt')

    index_div = _fetch_soup(url, headers).find('div', id='list')
    if index_div is None:
        # Without the index there is nothing to download; fail with a clear
        # message instead of an AttributeError on None.
        sys.exit('chapter index (div#list) not found at ' + url)
    chapters = index_div.find_all('dd')

    # Create the output directory on first run so open() cannot fail on it.
    os.makedirs(out_dir, exist_ok=True)

    # ``with`` guarantees the file is flushed and closed even if a request
    # or parse step raises part-way through the book.
    with open(out_path, 'wb') as f:
        for entry in chapters:
            link = entry.find('a')
            if link is None or not link.get('href'):
                # A <dd> without a usable link is not a chapter entry.
                continue

            # Fall back to the link text when the title attribute is absent.
            title = link.get('title') or link.get_text(strip=True)
            print(title)

            # urljoin resolves relative, root-relative and absolute hrefs
            # alike; for plain relative hrefs it equals url + href.
            chapter_url = urljoin(url, link['href'])
            content = _fetch_soup(chapter_url, headers).find('div', id='content')
            if content is None:
                # Keep going with the remaining chapters, but report the gap.
                print('skipped (div#content not found): ' + chapter_url,
                      file=sys.stderr)
                continue

            # Encode once per chapter; the bytes written are the same as
            # writing title, text and the blank-line separator separately.
            f.write((title + content.text + '\n\n').encode('utf-8'))
上一篇 下一篇

猜你喜欢

热点阅读