python爬虫入门看这个就够了Python五期爬虫作业

【python爬虫】第十六次 xpath整站抓取阳光电影网电影资

2018-01-19  本文已影响10人  急躁的假汉子

一、解析电影url

请求15题构造出的每个电影菜单的分页url,解析出每个电影url

二、xpath解析电影资源

对第一题解析出来的电影url进行请求,解析出最终的电影名称 电影下载地址

import requests
from lxml import etree

root_url = 'http://www.ygdy8.com'

# Fetch the Ygdy8 (阳光电影) homepage. A timeout is set on every request in
# this script so that one unresponsive page cannot hang the whole crawl.
req = requests.get(root_url, timeout=10)
status_code = req.status_code  # kept for quick debugging of failed fetches
# The site serves GB-encoded pages; set the encoding before reading .text.
req.encoding = 'gb2312'
selector = etree.HTML(req.text)
# The first nine navigation-menu links on the homepage.
infos = selector.xpath('//div[@class="contain"]/ul/li[position()<10]/a')
for info in infos:
    info_text = info.xpath('text()')[0]
    # Skip the "classic movies" (经典影片) section.
    if info_text == '经典影片':
        continue
    info_url = root_url + info.xpath('@href')[0]
    req1 = requests.get(info_url, timeout=10)
    req1.encoding = 'gb2312'
    selector1 = etree.HTML(req1.text)
    # The pager text looks like "共N页/...": extract N as the page count.
    pager_text = selector1.xpath('//div[@class="co_content8"]/div[@class="x"]//text()')[1]
    page = int(pager_text.split('/')[0].replace('共', '').replace('页', '').strip())
    # The first pager link is e.g. "list_4_2.html"; dropping "2.html"
    # leaves the reusable filename prefix "list_4_".
    page_prefix = selector1.xpath('//div[@class="co_content8"]/div[@class="x"]//a/@href')[0].replace('2.html', '')
    base_url = info_url.replace('index.html', '')
    url_list = [base_url + page_prefix + str(i) + '.html' for i in range(1, page + 1)]
    for page_url in url_list:
        req2 = requests.get(page_url, timeout=10)
        req2.encoding = 'gb2312'
        selector2 = etree.HTML(req2.text)
        # Every movie link on this listing page.
        movie_part_urls = selector2.xpath('//div[@class="co_content8"]/ul//a/@href')
        for movie_part_url in movie_part_urls:
            movie_url = root_url + movie_part_url
            req3 = requests.get(movie_url, timeout=10)
            req3.encoding = 'gb2312'
            selector3 = etree.HTML(req3.text)
            movie_name = selector3.xpath('//div[@class="co_area2"]//h1/font/text()')[0]
            # NOTE(review): the split on '/[' presumably trims a trailing
            # bracketed note from the download link; it is a no-op when the
            # marker is absent — confirm against live pages.
            movie_download_url = selector3.xpath('//div[@class="co_content8"]//table//a/@href')[0].split('/[')[0]
            print(movie_name, movie_download_url)

结果如下

image.png

三、将代码封装成函数

import requests
from lxml import etree
from multiprocessing import Pool
#获取阳光电影菜单栏
def get_menu_url(url):
    """Fetch the site homepage and return the URLs of its first nine
    navigation-menu entries, skipping the '经典影片' (classics) section.

    :param url: root URL of the site (e.g. "http://www.ygdy8.com").
    :return: list of absolute menu URLs.
    """
    # Timeout so an unresponsive site cannot hang the crawl.
    req = requests.get(url, timeout=10)
    # Pages are served in a GB encoding; set it before reading .text.
    req.encoding = 'gb2312'
    selector = etree.HTML(req.text)
    infos = selector.xpath('//div[@class="contain"]/ul/li[position()<10]/a')
    info_urls = []
    for info in infos:
        # Skip the "classic movies" menu entry.
        if info.xpath('text()')[0] == '经典影片':
            continue
        # BUG FIX: build absolute URLs from the `url` parameter instead of
        # the module-level `root_url`, which is only defined under the
        # __main__ guard and made this function fail when imported.
        info_urls.append(url + info.xpath('@href')[0])
    return info_urls

def get_page_url(urls):
    """For every menu URL, discover how many listing pages exist and
    return the full URL of each listing page.

    :param urls: iterable of menu URLs ending in "index.html".
    :return: flat list of paginated listing URLs across all menus.
    """
    page_urls = []
    for menu_url in urls:
        resp = requests.get(menu_url)
        resp.encoding = 'gb2312'
        doc = etree.HTML(resp.text)
        # The pager text reads "共N页/...": pull out N as the page count.
        pager_text = doc.xpath('//div[@class="co_content8"]/div[@class="x"]//text()')[1]
        total_pages = int(pager_text.split('/')[0].replace('共', '').replace('页', '').strip())
        # The first pager link is e.g. "list_4_2.html"; dropping "2.html"
        # leaves the reusable filename prefix "list_4_".
        prefix = doc.xpath('//div[@class="co_content8"]/div[@class="x"]//a/@href')[0].replace('2.html', '')
        base = menu_url.replace('index.html', '')
        page_urls.extend(base + prefix + str(page_no) + '.html'
                         for page_no in range(1, total_pages + 1))
    return page_urls

def get_movie_url(url):
    """Crawl one listing page: follow each movie link on it and print the
    movie title together with its download address.

    :param url: URL of a paginated listing page.
    """
    site_root = 'http://www.ygdy8.com'
    listing = requests.get(url)
    listing.encoding = 'gb2312'
    listing_doc = etree.HTML(listing.text)
    # Visit every movie detail page linked from this listing.
    for href in listing_doc.xpath('//div[@class="co_content8"]/ul//a/@href'):
        detail = requests.get(site_root + href)
        detail.encoding = 'gb2312'
        detail_doc = etree.HTML(detail.text)
        name = detail_doc.xpath('//div[@class="co_area2"]//h1/font/text()')[0]
        download = detail_doc.xpath('//div[@class="co_content8"]//table//a/@href')[0]
        print(name, download)



if __name__ == '__main__':
    root_url = 'http://www.ygdy8.com'
    # Step 1: navigation-menu URLs from the homepage.
    menu_urls = get_menu_url(root_url)
    # Step 2: every paginated listing URL under those menus.
    page_urls = get_page_url(menu_urls)
    # Step 3: crawl the listing pages in parallel with 4 worker processes.
    with Pool(4) as pool:
        pool.map(get_movie_url, page_urls)

结果如下图

image.png
上一篇下一篇

猜你喜欢

热点阅读