# Scrape movies and save them to an Excel file (爬取电影并存为excel)
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import lxml
# Entry page of the chart; next-page hrefs found by parseHtml() are appended to it.
douban = 'https://movie.douban.com/top250'

# Parallel accumulators filled by parseHtml(), one entry per movie:
#   name  -> text of the movie's title span
#   quote -> text of the movie's one-line quote span ("inq")
#   score -> text of the movie's rating span ("rating_num")
name, quote, score = [], [], []
def _span_text(node, css_class, default=''):
    """Return the text of the first ``<span class=css_class>`` under *node*.

    Returns *default* when no such span exists, instead of failing on ``None``.
    """
    span = node.find('span', attrs={'class': css_class})
    return span.getText() if span is not None else default


def parseHtml(html):
    """Parse one listing page, record its movies, and follow the next-page link.

    For every ``<li>`` inside the ``<ol class="grid_view">`` element, the
    title, rating and one-line quote are printed and appended to the
    module-level ``name``, ``score`` and ``quote`` lists. When the page has a
    next-page link, ``download`` is called on it, so one call walks all the
    remaining pages.

    Args:
        html: Markup of a listing page, in any form ``BeautifulSoup`` accepts.

    Raises:
        ValueError: If the page has no ``<ol class="grid_view">`` element
            (e.g. an error or block page was returned instead of the list).
    """
    soup = BeautifulSoup(html, 'lxml')

    movie_list = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list is None:
        raise ValueError('movie list (<ol class="grid_view">) not found in page')

    for movie_li in movie_list.find_all('li'):
        movie_name = _span_text(movie_li, 'title')
        movie_star = _span_text(movie_li, 'rating_num')
        # Not every entry carries a quote; fall back to '' so the three
        # lists stay the same length and the DataFrame can be built.
        movie_quote = _span_text(movie_li, 'inq')
        print('{0} {1} {2}'.format(movie_name, movie_star, movie_quote))
        name.append(movie_name)
        score.append(movie_star)
        quote.append(movie_quote)

    # The "next" span has no <a> child on the last page; guard against the
    # span itself being absent as well.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    if next_link is not None:
        # The href is appended to the base URL to form the next page's address.
        download(douban + '{0}'.format(next_link['href']))
    else:
        print('all is ok')
def download(url):
    """Fetch *url* and pass the response body to ``parseHtml``.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    print(url)
    # NOTE(review): sites commonly reject the default python-requests
    # User-Agent; a browser-like one is sent here -- confirm it is still needed.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0 Safari/537.36'
        ),
    }
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    parseHtml(response.content)
# Crawl every page starting from the first one; this fills the module-level
# name / score / quote lists.
download(douban)

# One row per movie. 'rate' holds the rating and 'pingyu' (comment) holds the
# one-line quote; the original had these two columns swapped and was missing
# the closing parenthesis of the DataFrame call.
df = pd.DataFrame({'title': name, 'rate': score, 'pingyu': quote})
df.to_excel('foo.xlsx', sheet_name='Sheet1')