个人专题工具癖大数据 爬虫Python AI Sql

用Python在豆瓣上找书看

2017-01-17  本文已影响331人  dalalaa

我有时候会上豆瓣上看书评,一般是通过这个标签页面来找:

Paste_Image.png

但是这个页面不像淘宝,没有筛选功能,所以打算用爬虫爬下来自己筛选。
我主要爬取了这几个信息:标题、评分、阅读人数、页数、出版日期和价格。这些是我看书比较关注的东西。爬取下来的数据我选择直接存入pandas中的DataFrame来进行筛选。
下面是代码

# -*- coding: UTF-8 -*-
import re
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
# 豆瓣这个页面没有反爬虫,所以不需要伪装成浏览器。
def findTag(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    The response object is closed via a ``with`` block so the
    underlying socket is not leaked (``urlopen`` results are not
    closed automatically).
    """
    with urllib.request.urlopen(url) as source_code:
        soup = BeautifulSoup(source_code, "html.parser")
    return soup
def findTitle(soup):
    """Return the cleaned book titles found on a tag-listing page.

    Douban wraps each title in an ``<h2 class=...>``; the raw text
    carries newlines and padding spaces, which are stripped here.
    """
    heading_tags = soup.findAll('h2', {'class': True})
    return [tag.get_text().replace('\n', '').replace(' ', '')
            for tag in heading_tags]
def findRating(soup):
    """Return one rating (float) per book listed on the page.

    Books with too few votes carry no ``rating_nums`` span; those
    default to 0.0.  The previous bare ``except:`` also swallowed
    KeyboardInterrupt/SystemExit — narrowed to the exceptions that
    actually occur here.
    """
    star_blocks = soup.findAll('div', {'class': 'star clearfix'})
    rating = []
    for item in star_blocks:
        try:
            # find() returns None when the span is absent -> AttributeError.
            r = float(item.find('span', {'class': 'rating_nums'}).get_text())
        except (AttributeError, ValueError, TypeError):
            r = 0.0
        rating.append(r)
    return rating
def findPopularity(soup):
    """Return the number of raters for each book as floats.

    The raw span text looks like ``"(1234人评价)"`` or
    ``"(少于10人评价)"``; all decoration is stripped before the float
    conversion.  Unlike the original, a missing or non-numeric span no
    longer raises — it falls back to 0.0 so the result stays aligned
    with the other per-book lists (consistent with findRating).
    """
    popularity = []
    for item in soup.findAll('div', {'class': 'star clearfix'}):
        try:
            raw = item.find('span', {'class': 'pl'}).get_text()
            # Single pass removing the same tokens the old chained subs did.
            cleaned = re.sub(r'\n| |人评价\)|\(|少于', '', raw)
            popularity.append(float(cleaned))
        except (AttributeError, ValueError):
            popularity.append(0.0)
    return popularity
def findInfor(soup):  # NOTE: urlopen() dominates runtime — one extra request per book.
    """Visit each book's detail page and gather page count, publication
    year and list price.

    Returns ``[thickness, year, price]``: three parallel lists with one
    entry per book.  A field missing from the detail page is recorded
    as None (the original crashed with AttributeError in that case),
    keeping the lists aligned with the listing-page data.
    """
    def _field(page_soup, label):
        # Detail pages store e.g. "页数: 123" as <span>页数:</span>123,
        # so the value is the span's next sibling text node.
        span = page_soup.find('span', text=re.compile(label))
        return span.next_sibling if span is not None else None

    thickness, year, price = [], [], []
    for item in soup.findAll('h2', {'class': True}):
        href = item.find('a').attrs['href']
        # Close each response explicitly so sockets are not leaked.
        with urllib.request.urlopen(href) as resp:
            detail = BeautifulSoup(resp, "html.parser")
        thickness.append(_field(detail, '页数'))
        year.append(_field(detail, '出版年'))
        price.append(_field(detail, '定价'))
    return [thickness, year, price]
def switchPages(keyword, pages=1):
    """Scrape *pages* result pages for the Douban tag *keyword* and
    return the collected data as a pandas DataFrame.

    Each result page lists 20 books; for every book we collect title,
    rating, number of raters, page count, publication year and price.
    Fixes the original's mis-indented ``print`` (a syntax error) and
    generalizes the hard-coded single-page loop via ``pages``.
    """
    book_title_list = []
    rating_list = []
    popularity_list = []
    thickness_list = []
    year_list = []
    price_list = []
    for i in range(pages):
        # Douban paginates with a ?start= offset, 20 books per page.
        page = ("https://book.douban.com/tag/" + urllib.parse.quote(keyword)
                + "?start=" + str(20 * i) + "&type=T")
        soup = findTag(page)
        book_title_list.extend(findTitle(soup))
        rating_list.extend(findRating(soup))
        popularity_list.extend(findPopularity(soup))
        infor = findInfor(soup)
        thickness_list.extend(infor[0])
        year_list.extend(infor[1])
        price_list.extend(infor[2])
    # Sanity check: all six columns must have one entry per book.
    print(len(book_title_list), len(rating_list), len(popularity_list),
          len(thickness_list), len(year_list), len(price_list))
    df = pd.DataFrame({'Title': book_title_list, 'rating': rating_list,
                       'popularity': popularity_list,
                       'thickness': thickness_list,
                       'year': year_list, 'price': price_list})
    print(df)
    return df

switchPages("编程")

上一篇 下一篇

猜你喜欢

热点阅读