# 2019-08-26 first web-scraper example (第一个爬虫案例)
#第一步请求网页数据
import requests
from bs4 import BeautifulSoup
# Fetch the page.
url = 'https://book.douban.com/latest'
# Browser-like User-Agent so Douban does not reject the request as a bot.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
# timeout: without it requests.get can hang forever on a stalled connection.
data = requests.get(url, headers=headers, timeout=10)
# Fail fast on HTTP errors (403/404/5xx) instead of silently parsing an error page.
data.raise_for_status()
print(data.text)
# Parse the HTML with the lxml parser.
soup = BeautifulSoup(data.text, 'lxml')
print(soup)
# Inspect the page: books are laid out in two columns (left / right <ul>),
# so extract the <li> items from both containers by their class names.
import bs4
left_ul = soup.find('ul', {'class': 'cover-col-4 clearfix'})
right_ul = soup.find('ul', {'class': 'cover-col-4 pl20 clearfix'})
# Guard against layout changes: soup.find returns None when a container is
# missing, and calling .find_all on None would raise AttributeError.
books = []
for column in (left_ul, right_ul):
    if column is not None:
        books.extend(column.find_all('li'))
# Extract the same fields from every book block. The four lists are kept
# strictly parallel (same length) because they become DataFrame columns later.
img_urls = []
titles = []
ratings = []
authors = []
#details = []
for book in books:
    anchors = book.find_all('a')
    rating_tag = book.find('p', {'class': 'rating'})
    author_tag = book.find('p', {'class': 'color-gray'})
    # Skip malformed entries: indexing anchors[1] or calling .get_text() on a
    # missing tag would raise and abort the whole run; skipping the entry
    # also keeps the four lists aligned.
    if len(anchors) < 2 or rating_tag is None or author_tag is None:
        continue
    # Cover image URL (the <img> inside the first anchor).
    img_tag = anchors[0].find('img')
    img_urls.append(img_tag.get('src') if img_tag is not None else None)
    # Book title.
    titles.append(anchors[1].get_text())
    # Star rating — strip newlines and all spaces from the raw text.
    ratings.append(rating_tag.get_text().replace('\n', '').replace(' ', ''))
    # Author and publication info, cleaned the same way.
    authors.append(author_tag.get_text().replace('\n', '').replace(' ', ''))
print('done')
print(img_urls)
print(titles)
print(ratings)
print(authors)
# Export the collected data to a spreadsheet.
import pandas as pd
# Build the DataFrame in one step from parallel columns.
result = pd.DataFrame({
    'img_urls': img_urls,
    'titles': titles,
    'ratings': ratings,
    'authors': authors,
})
# Write .xlsx (openpyxl engine): the legacy .xls format relied on the xlwt
# engine, which pandas removed in 2.0, so to_excel('result.xls') now raises.
result.to_excel('result.xlsx', index=None)