【python爬虫】第十六次 xpath整站抓取阳光电影网电影资源
2018-01-19 本文已影响10人
急躁的假汉子
一、解析电影url
请求15题构造出的每个电影菜单的分页url,解析出每个电影url
二、xpath解析电影资源
对第一题解析出来的电影url进行请求,解析出最终的电影名称 电影下载地址
import requests
from lxml import etree

root_url = 'http://www.ygdy8.com'

# Fetch the Yangguang Movie homepage; the site serves GB2312-encoded pages,
# so the encoding must be forced before reading `.text`.
# A timeout is set on every request so a stalled server cannot hang the crawl.
req = requests.get(root_url, timeout=10)
status_code = req.status_code  # kept for debugging; not otherwise used
req.encoding = 'gb2312'
selector = etree.HTML(req.text)

# First 9 entries of the navigation bar; the "经典影片" menu uses a
# different page layout, so it is skipped.
infos = selector.xpath('//div[@class="contain"]/ul/li[position()<10]/a')
for info in infos:
    info_text = info.xpath('text()')[0]
    if info_text == '经典影片':
        continue
    info_url = root_url + info.xpath('@href')[0]

    # Request the menu page and work out how many listing pages it has.
    req1 = requests.get(info_url, timeout=10)
    req1.encoding = 'gb2312'
    selector1 = etree.HTML(req1.text)
    # The pagination widget text looks like "共N页/..."; extract N.
    page_text = selector1.xpath(
        '//div[@class="co_content8"]/div[@class="x"]//text()')[1]
    page = int(page_text.split('/')[0].replace('共', '').replace('页', '').strip())
    # The first pagination link ends in "2.html"; stripping that suffix
    # leaves the per-page link template (e.g. "list_23_").
    page_list = selector1.xpath(
        '//div[@class="co_content8"]/div[@class="x"]//a/@href')[0].replace('2.html', '')

    # Build every listing-page URL for this menu.
    url_list = [
        info_url.replace('index.html', '') + page_list + str(i) + '.html'
        for i in range(1, page + 1)
    ]

    for page_url in url_list:
        req2 = requests.get(page_url, timeout=10)
        req2.encoding = 'gb2312'
        selector2 = etree.HTML(req2.text)
        # Relative links to every movie detail page on this listing page.
        movie_part_urls = selector2.xpath('//div[@class="co_content8"]/ul//a/@href')
        for movie_part_url in movie_part_urls:
            movie_url = root_url + movie_part_url
            req3 = requests.get(movie_url, timeout=10)
            req3.encoding = 'gb2312'
            selector3 = etree.HTML(req3.text)
            movie_name = selector3.xpath('//div[@class="co_area2"]//h1/font/text()')[0]
            # First download link; trailing "/[..." junk is cut off.
            movie_download_url = selector3.xpath(
                '//div[@class="co_content8"]//table//a/@href')[0].split('/[')[0]
            print(movie_name, movie_download_url)
结果如下
image.png 三、将代码封装成函数
import requests
from lxml import etree
from multiprocessing import Pool
#获取阳光电影菜单栏
def get_menu_url(url):
    """Return the navigation-menu URLs of the Yangguang Movie site.

    Fetches *url* (the site root), decodes it as GB2312, and collects the
    absolute URLs of the first 9 navigation-bar entries, skipping the
    "经典影片" (classic films) menu whose page layout differs.
    """
    req = requests.get(url, timeout=10)  # timeout so a dead server can't hang us
    req.encoding = 'gb2312'  # site serves GB2312; must set before reading .text
    selector = etree.HTML(req.text)
    infos = selector.xpath('//div[@class="contain"]/ul/li[position()<10]/a')
    info_urls = []
    for info in infos:
        if info.xpath('text()')[0] == '经典影片':
            continue
        # BUG FIX: build absolute URLs from the *url* argument instead of the
        # module-global `root_url`, which is only defined under the
        # `if __name__ == '__main__'` guard and would raise NameError if this
        # module were imported and the function called directly.
        info_urls.append(url + info.xpath('@href')[0])
    return info_urls
def get_page_url(urls):
    """Expand each menu URL in *urls* into the URLs of all its listing pages.

    For every menu page, reads the pagination widget (text like "共N页/..."),
    derives the per-page link template (e.g. "list_23_"), and generates the
    full set of listing-page URLs. Returns one flat list for all menus.
    """
    url_list = []
    for url in urls:
        req1 = requests.get(url, timeout=10)  # timeout prevents indefinite hangs
        req1.encoding = 'gb2312'  # site serves GB2312-encoded pages
        selector1 = etree.HTML(req1.text)
        # Pagination text looks like "共N页/..."; extract N as an int.
        page_text = selector1.xpath(
            '//div[@class="co_content8"]/div[@class="x"]//text()')[1]
        page = int(page_text.split('/')[0].replace('共', '').replace('页', '').strip())
        # First pagination link ends in "2.html"; stripping it leaves the
        # per-page link template.
        page_list = selector1.xpath(
            '//div[@class="co_content8"]/div[@class="x"]//a/@href')[0].replace('2.html', '')
        base = url.replace('index.html', '')
        url_list.extend(base + page_list + str(i) + '.html'
                        for i in range(1, page + 1))
    return url_list
def get_movie_url(url):
    """Scrape one listing page: print each movie's name and download link.

    Fetches every movie detail page linked from the listing page *url* and
    prints (movie_name, movie_download_url). Also returns the pairs as a
    list of tuples, so `Pool.map` callers can collect results instead of
    relying on stdout only (backward-compatible: previous callers ignored
    the implicit None return).
    """
    root_url = 'http://www.ygdy8.com'
    req2 = requests.get(url, timeout=10)  # timeout so one dead page can't stall a worker
    req2.encoding = 'gb2312'  # site serves GB2312-encoded pages
    selector2 = etree.HTML(req2.text)
    movie_part_urls = selector2.xpath('//div[@class="co_content8"]/ul//a/@href')
    results = []
    for movie_part_url in movie_part_urls:
        movie_url = root_url + movie_part_url
        req3 = requests.get(movie_url, timeout=10)
        req3.encoding = 'gb2312'
        selector3 = etree.HTML(req3.text)
        names = selector3.xpath('//div[@class="co_area2"]//h1/font/text()')
        links = selector3.xpath('//div[@class="co_content8"]//table//a/@href')
        # BUG FIX: some detail pages lack the expected markup; bare [0]
        # indexing raised IndexError and killed the whole Pool worker.
        # Skip malformed pages instead.
        if not names or not links:
            continue
        print(names[0], links[0])
        results.append((names[0], links[0]))
    return results
if __name__ == '__main__':
    root_url = 'http://www.ygdy8.com'
    # Step 1: collect the navigation-menu URLs from the homepage.
    menu_urls = get_menu_url(root_url)
    # Step 2: expand each menu into all of its listing-page URLs.
    page_urls = get_page_url(menu_urls)
    # Step 3: scrape the listing pages in parallel with 4 worker processes.
    pool = Pool(4)
    pool.map(get_movie_url, page_urls)