爬取豆瓣前250电影资料
2018-12-15 本文已影响0人
超人不会飞_9664
需要用到两大模块
- requests
- BeautifulSoup
requests
- 安装requests
pip install requests
- 导入requests
import requests
BeautifulSoup
- 安装BeautifulSoup
pip install beautifulsoup4
或
pipenv install beautifulsoup4
- 导入BeautifulSoup
import bs4
或
from bs4 import BeautifulSoup
步入正题爬取信息
import requests
from bs4 import BeautifulSoup
def url_open(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP 4xx/5xx status (raised by ``raise_for_status``).
    """
    # To route traffic through a local proxy, pass for example
    # proxies={"http": "127.0.0.1:1080", "https": "127.0.0.1:1080"}
    # to requests.get below.
    # Send a desktop-browser User-Agent instead of the library default.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page rather than letting the parsers choke on it.
    res.raise_for_status()
    return res
def find_depth(res):
    """Return the page count read from the pagination area of a list page.

    Args:
        res: Response-like object whose ``text`` attribute holds the page HTML.

    Returns:
        The integer found in the element two siblings before the
        ``<span class="next">`` marker.
    """
    document = BeautifulSoup(res.text, 'html.parser')
    next_marker = document.find('span', class_='next')
    # Two hops back: presumably the immediate previous sibling is a
    # whitespace text node and the one before it is the last page link.
    # NOTE(review): confirm against the live markup.
    last_page_node = next_marker.previous_sibling.previous_sibling
    return int(last_page_node.text)
def find_movies(res):
    """Extract one formatted text line per movie from a list page.

    Each field is looked up relative to the movie's own title block, so a
    missing or malformed field can only drop that single movie. It can never
    shift the fields of the movies that follow it.

    Args:
        res: Response-like object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of strings, one per movie, each built as
        ``<title>`` + ``评分:<rating>`` + ``<first description line>`` +
        ``<second description line>`` + a trailing newline.
    """
    soup = BeautifulSoup(res.text, 'html.parser')
    results = []
    for title_block in soup.find_all('div', class_='hd'):
        # NOTE(review): assumes the 'hd' title block shares its parent
        # container with the 'bd' description block and the rating span;
        # confirm against the live page markup.
        entry = title_block.parent
        try:
            # Movie title.
            title = title_block.a.span.text
            # Rating.
            rating = entry.find('span', class_='rating_num').text
            # Description: the first paragraph holds two lines of details,
            # preceded by an empty line.
            lines = entry.find('div', class_='bd').p.text.split('\n')
            details = lines[1].strip() + lines[2].strip()
        except (AttributeError, IndexError):
            # A lookup returned None or the paragraph had too few lines:
            # skip this entry as a whole.
            continue
        results.append(title + '评分:%s' % rating + details + '\n')
    return results
def main():
    """Scrape every page of the Douban Top 250 list into ``250movies.txt``.

    Fetches the first page to learn how many pages exist, then requests each
    page by its ``start`` offset (25 entries per page) and writes all
    collected lines to the output file in UTF-8.
    """
    host = 'https://movie.douban.com/top250'
    depth = find_depth(url_open(host))

    result = []
    # Offsets 0, 25, 50, ... cover one page each.
    for offset in range(0, 25 * depth, 25):
        page_url = host + '?start=' + str(offset) + '&filter='
        page = url_open(page_url)
        result.extend(find_movies(page))

    with open('250movies.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines(result)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()