网络爬虫:urllib模块应用8--猫眼
2018-12-23 本文已影响0人
牛耀
# 1: 分析网站,找到目标url,判断是否是静态页面
# https://maoyan.com/board/4?offset=0
# https://maoyan.com/board/4?offset=10
from urllib import parse,request
import re,pymysql
def maoyanSpider(url):
    """
    Crawl the board page by page, starting from ``url``.

    Each page is fetched with ``load_page_data`` and parsed with
    ``parse_page_data``. Every parsed movie is turned into a dict and
    printed. The next page URL is derived by adding 10 to the ``offset``
    query value of the URL that was actually fetched. Crawling stops when
    a page yields no movies, when the page could not be loaded, or when
    the current URL carries no ``offset=<n>`` value to advance.

    :param url: URL of the first page to fetch (expected to contain
        ``offset=<n>``).
    :return: None
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    offset_pattern = re.compile(r'offset=(\d+)')

    # Iterate instead of recursing so the call stack does not grow with
    # the number of pages visited.
    while True:
        page = load_page_data(url)
        if page is None:
            # load_page_data yields None for a non-200 response.
            print('结束')
            return

        html, current_url = page
        movies = parse_page_data(html)
        if not movies:
            print('结束')
            return

        for movie in movies:
            movie_data = {
                # Rank on the board.
                'rank': int(movie[0]),
                # Cover image URL.
                'coverImage': movie[1],
                # Movie title.
                'name': movie[2],
                # Cast line with newlines and spaces stripped.
                'actor': movie[3].replace('\n', '').replace(' ', ''),
                # Release date with the '上映时间' label removed.
                'publishTime': movie[4].replace('上映时间', ''),
                # Score: integer part and fraction part are captured
                # separately and concatenated before conversion.
                'scorenum': float(movie[5] + movie[6]),
            }
            # NOTE: persistence through save_data_to_db is currently
            # disabled; records are only printed.
            print(movie_data)

        # Build the next page URL from the URL that was actually served.
        match = offset_pattern.search(current_url)
        if match is None:
            # Without an offset there is nothing to advance; stop rather
            # than fail with an IndexError.
            print('结束')
            return
        nextpage_offset = int(match.group(1)) + 10
        url = offset_pattern.sub('offset=' + str(nextpage_offset), current_url)
def load_page_data(url, timeout=10):
    """
    Fetch ``url`` and return its decoded body together with the final URL.

    :param url: address of the page to request.
    :param timeout: seconds to wait on the connection before giving up
        (passed to ``urlopen``); prevents the crawler from hanging forever.
    :return: ``(html, final_url)`` where ``html`` is the body decoded as
        UTF-8 and ``final_url`` is ``response.url``; ``None`` when the
        response status is not 200.
    """
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
        # NOTE(review): this Referer points at a different site than the
        # one being crawled; it looks copied from another scraper. Left
        # unchanged here -- confirm whether it is needed at all.
        'Referer': 'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
    }
    # Build the request object carrying the custom headers.
    req = request.Request(url, headers=req_header)
    # The context manager closes the response (and its socket) even if
    # reading or decoding raises.
    with request.urlopen(req, timeout=timeout) as response:
        if response.status == 200:
            return response.read().decode('utf-8'), response.url
    # Explicit "no data" result for any other status.
    return None
def parse_page_data(html):
    """
    Extract the movie entries from a page's HTML source.

    Each ``<dd>...</dd>`` entry that matches yields one 7-tuple of captured
    strings, in the order the fragments below appear.

    :param html: page source as a string.
    :return: list of 7-tuples, one per matched entry; empty when nothing
        matches.
    """
    fragments = (
        '<dd>.*?<i.*?>(.*?)</i>',          # first <i> after <dd>
        '.*?<img.*?data-src="(.*?)"',      # data-src attribute of an <img>
        '.*?<p.*?>.*?<a.*?>(.*?)</a>',     # text of the <a> inside a <p>
        '.*?<p.*?>(.*?)</p>',              # next <p> text
        '.*?<p.*?>(.*?)</p>',              # following <p> text
        '.*?<i.*?>(.*?)</i>',              # next <i> text
        '.*?<i.*?>(.*?)</i>.*?</dd>',      # last <i> text, up to </dd>
    )
    # re.S lets '.' span newlines, since one entry covers many lines.
    entry_re = re.compile(''.join(fragments), re.S)
    return entry_re.findall(html)
def save_data_to_db(movieInfo):
    """
    Store one movie record (placeholder: nothing is persisted yet).

    :param movieInfo: the movie record to store; currently ignored.
    :return: None
    """
    return None
if __name__ == '__main__':
    # Open the MySQL connection. Keyword arguments are used because
    # PyMySQL 1.0+ no longer accepts positional connection arguments.
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    mysql_client = pymysql.Connect(
        host='127.0.0.1',
        user='root',
        password='18603503110',
        database='1712B',
        port=3306,
        charset='utf8',
    )
    # Cursor used to execute MySQL statements.
    cursor = mysql_client.cursor()
    try:
        start_url = 'https://maoyan.com/board/4?offset=0'
        maoyanSpider(start_url)
    finally:
        # Release database resources even if the crawl fails.
        cursor.close()
        mysql_client.close()