Python爬取豆瓣图书 Top 250

2020-02-24  本文已影响0人  李白开水

使用 requests + BeautifulSoup 爬取豆瓣图书 Top 250,并用 yagmail 把结果文件作为附件通过邮件发送给自己(以下代码为 Python 2 写法)。

#!/usr/bin/env python
# encoding: utf-8
import requests
from bs4 import BeautifulSoup
import sys
import yagmail

# Python 2 only: make UTF-8 the codec used for implicit str <-> unicode
# conversion, so the byte-string labels in main() can be concatenated with the
# scraped text and written to a file. reload() is needed because
# sys.setdefaultencoding is removed from the sys module during interpreter
# start-up. Python 3 has neither reload() as a builtin nor
# sys.setdefaultencoding(), so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')

# Fetch the Douban Books Top 250 list.


def get_url(root_url, n):
    """Return the URL of the ranking page that starts at a given offset.

    The offset is ``n * 25``; its decimal text is appended to ``root_url``.

    Args:
        root_url: URL prefix the offset is appended to.
        n: Zero-based page index.

    Returns:
        ``root_url`` followed by ``str(n * 25)``.
    """
    page_offset = n * 25
    return ''.join((root_url, str(page_offset)))


def get_review(page_url):
    """Scrape the book entries from one listing page.

    Downloads ``page_url``, parses the HTML with BeautifulSoup (lxml parser)
    and treats every ``<table width="100%">`` element as one book entry.

    Args:
        page_url: URL of the listing page to fetch.

    Returns:
        A list of ``(name, alias, info, score)`` tuples:
        ``name`` is the title link text with newlines and spaces removed,
        ``alias`` is the text of the first ``<span>`` in the title ``<div>``
        with colons removed (falls back to ``name`` when there is no span),
        ``info`` is the text of the ``<p class="pl">`` element and ``score``
        the stripped text of ``<span class="rating_nums">``. ``info`` and
        ``score`` are empty strings when the element is missing. Tables
        without a title link are skipped.

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout
            or a 4xx/5xx HTTP status.
    """
    # Without a timeout a stalled connection would hang the script forever.
    response = requests.get(page_url, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page into an
    # empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    books_list = []
    for item in soup.find_all('table', {"width": "100%"}):
        title_div = item.div
        title_link = title_div.a if title_div is not None else None
        if title_link is None:
            # Not shaped like a book entry; skip it rather than crash on None.
            continue
        r_name = title_link.text.strip().replace('\n', '').replace(' ', '')

        alias_tag = title_div.span  # present only when the book has an alias
        if alias_tag is not None:
            name2 = alias_tag.text.strip().replace(':', '')
        else:
            name2 = r_name

        info_tag = item.find('p', {"class": "pl"})
        score_tag = item.find('span', {"class": "rating_nums"})
        info = info_tag.text if info_tag is not None else ''
        score = score_tag.text.strip() if score_tag is not None else ''

        books_list.append((r_name, name2, info, score))
    return books_list


def send_mail():
    """Mail the scraped book list as an attachment.

    Logs in to ``smtp.qq.com`` through yagmail and sends a message with the
    file ``D:\\top250_books.txt`` attached, then prints a confirmation line.
    """
    # NOTE(review): the account and password are hard-coded in the source;
    # move them to configuration or the environment before real use.
    yag = yagmail.SMTP(user='bb@qq.com', password='abcdefg', host='smtp.qq.com')
    yag.send(
        to='bb@qq.com',
        subject="豆瓣图书 Top 250",
        contents="豆瓣图书 Top 250",
        attachments=["D:\\top250_books.txt"],
    )
    # Parenthesised form prints the same text on Python 2 and is also valid
    # syntax on Python 3 (the bare print statement is not).
    print("Send already")


def main():
    """Scrape ten listing pages, save the books to a text file and mail it.

    Fetches the pages at offsets 0, 25, ..., 225 via get_url()/get_review(),
    writes one tab-separated record per book to ``D:\\top250_books.txt`` and
    then calls send_mail(). All pages are fetched before the file is opened,
    so a failed download leaves any existing file untouched.
    """
    # get_url() appends the offset itself, so the prefix must end right after
    # "start=" (a trailing "0" here would yield "start=025", "start=050", ...).
    root_url = "https://book.douban.com/top250?start="

    all_books = []
    for n in range(10):
        all_books.extend(get_review(get_url(root_url, n)))

    # Open once in 'w' mode: appending would duplicate every record in the
    # mailed attachment each time the script is run again.
    with open('D:\\top250_books.txt', 'w') as f:
        for book in all_books:
            # NOTE(review): each record ends with a tab, not a newline, so the
            # whole file is a single line - confirm this is intended.
            book_info = ("图书名称:" + book[0] + "\t"
                         + "图书别名:" + book[1] + "\t"
                         + "作者及出版信息:" + book[2] + "\t"
                         + "评分:" + book[3] + "\t")
            f.write(book_info)
    send_mail()


# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
上一篇下一篇

猜你喜欢

热点阅读