2019-08-26 第一个爬虫案例

2019-08-26  本文已影响0人  王自然_4e0b

#第一步请求网页数据

import requests

from bs4 import BeautifulSoup

#请求数据

# Fetch the "latest books" page from Douban.
url = 'https://book.douban.com/latest'

# Present a normal browser UA: Douban rejects the default python-requests agent.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}

# Always pass a timeout — requests has none by default and would block forever
# on a hung server. Fail fast on HTTP 4xx/5xx instead of parsing an error page.
data = requests.get(url, headers=headers, timeout=10)
data.raise_for_status()

print(data.text)

# Parse the HTML using the lxml backend.
soup = BeautifulSoup(data.text, 'lxml')

print(soup)

#检查元素观察网页

#观察到网页上的书籍按照两边分布,按照标签进行提取

# NOTE(review): unused import — BeautifulSoup (imported above) already wraps
# bs4; kept only to avoid disturbing any later code that might rely on it.
import bs4


def _extract_items(parent, ul_class):
    """Return the <li> tags inside the <ul> with class *ul_class*, or [] if absent.

    The page lays the books out in two <ul> columns; a missing column (layout
    change, blocked/anti-bot response) must not crash the script.
    """
    container = parent.find('ul', {'class': ul_class})
    return container.find_all('li') if container is not None else []


# The books are split into a left and a right column — grab both.
books_left = _extract_items(soup, 'cover-col-4 clearfix')
books_right = _extract_items(soup, 'cover-col-4 pl20 clearfix')

books = books_left + books_right

#对每一个图书区块进行相同的操作,获取图书信息

# Walk every book block and collect its fields into parallel lists:
# cover image URL, title, star rating, and author/publisher line.

img_urls = []
titles = []
ratings = []
authors = []


def _clean(text):
    """Strip the newlines and spaces the page embeds around text nodes."""
    return text.replace('\n', '').replace(' ', '')


for item in books:
    anchors = item.find_all('a')

    # Cover image URL: the <img> nested in the first anchor.
    img_urls.append(anchors[0].find('img').get('src'))

    # Book title: the text of the second anchor.
    titles.append(anchors[1].get_text())

    # Star rating, whitespace removed.
    ratings.append(_clean(item.find('p', {'class': 'rating'}).get_text()))

    # Author and publication info, whitespace removed.
    authors.append(_clean(item.find('p', {'class': 'color-gray'}).get_text()))

print('done')

print(img_urls)

print(titles)

print(ratings)

print(authors)

#数据导出

# Export the collected fields to a spreadsheet.

import pandas as pd

# Build the frame in one shot from the parallel lists instead of
# assigning the columns one by one.
result = pd.DataFrame({
    'img_urls': img_urls,
    'titles': titles,
    'ratings': ratings,
    'authors': authors,
})

# Write .xlsx rather than legacy .xls: writing .xls required the xlwt engine,
# which pandas removed in 2.0, so to_excel('result.xls') fails on modern installs.
result.to_excel('result.xlsx', index=None)

上一篇下一篇

猜你喜欢

热点阅读