Scraping Pear Video (2018-11-02)
Mr_Du_Biao
Spider file
# -*- coding: utf-8 -*-
import scrapy
import time
from selenium import webdriver
from lxml import etree
import re
from liVedioPro.items import LivedioproItem


class LidemoSpider(scrapy.Spider):
    name = 'liDemo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.pearvideo.com/category_6']

    # Use selenium to load the "more videos" data on the front page
    def getpageSource(self):
        bro = webdriver.PhantomJS(
            executable_path=r'C:\Users\Administrator\Desktop\12期爬虫授课\part_one\4.selenium&phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
        bro.get(url=self.start_urls[0])
        time.sleep(3)
        # Scroll to the bottom twice so the lazily loaded list items render
        js = "window.scrollTo(0,document.body.scrollHeight)"
        bro.execute_script(js)
        time.sleep(2)
        bro.execute_script(js)
        time.sleep(2)
        # Click the "load more" button to pull in additional videos
        a = bro.find_element_by_id("listLoadMore")
        if a:
            a.click()
            time.sleep(2)
        # Page source now contains the extra video entries
        page_text = bro.page_source
        bro.quit()
        return page_text

    # 1. Use selenium to load the extra front-page data, parse the
    #    second-level (detail) page urls out of it, and issue a request for each
    def start_requests(self):
        # Page data containing the extra video entries
        page_text = self.getpageSource()
        # Parse the detail-page urls out of that page data
        urls_list = self.myParse(page_text)
        for url in urls_list:
            yield scrapy.Request(url=url, callback=self.getSecondPage)

    # Grab the raw video bytes and derive the video name from the url
    def getVideoData(self, response):
        videoData = response.body
        item = LivedioproItem()
        item['videoData'] = videoData
        item['name'] = response.url.split('/')[-1]
        yield item

    # Parse the second-level (detail) page: the real video url sits in a
    # JS variable named srcUrl
    def getSecondPage(self, response):
        page_text = response.text
        video_url = re.findall('srcUrl="(.*?)",', page_text, re.S)[0]
        yield scrapy.Request(url=video_url, callback=self.getVideoData)

    # Parse all of the detail-page urls out of the front page
    def myParse(self, pageText):
        tree = etree.HTML(pageText)
        li_list = tree.xpath('//li[@class="categoryem"]')
        urls_list = []
        for li in li_list:
            if not li.xpath('./div/a/@href'):
                continue
            secondPage_url = "http://www.pearvideo.com/" + li.xpath('./div/a/@href')[0]
            urls_list.append(secondPage_url)
        return urls_list
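Note that newer Selenium releases have dropped PhantomJS support. As a sketch only (not part of the original post), the same scroll-and-click logic can be driven with headless Chrome instead; this assumes chromedriver is installed and on PATH, and is meant as a drop-in replacement for the spider's getpageSource method:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# Drop-in replacement for the spider's getpageSource method (sketch)
def getpageSource(self):
    # Assumption: chromedriver is installed and discoverable on PATH
    options = Options()
    options.add_argument('--headless')
    bro = webdriver.Chrome(options=options)
    bro.get(self.start_urls[0])
    time.sleep(3)
    # Same scroll + "load more" sequence as the PhantomJS version
    js = "window.scrollTo(0,document.body.scrollHeight)"
    bro.execute_script(js)
    time.sleep(2)
    bro.execute_script(js)
    time.sleep(2)
    bro.find_element_by_id("listLoadMore").click()
    time.sleep(2)
    page_text = bro.page_source
    bro.quit()
    return page_text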
Items file
import scrapy


class LivedioproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    videoData = scrapy.Field()
Pipeline file
import os


class LivedioproPipeline(object):
    def open_spider(self, spider):
        # Create a folder to hold the downloaded videos
        if not os.path.exists('PearVideo'):
            os.mkdir('PearVideo')

    def process_item(self, item, spider):
        # Write the raw video bytes to PearVideo/<name>
        filePath = 'PearVideo/' + item['name']
        with open(filePath, 'wb') as fp:
            fp.write(item['videoData'])
        print(filePath + " downloaded successfully!")
        return item
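For the pipeline to actually receive the items, it has to be enabled in the project's settings.py. A minimal sketch, assuming the default layout of the liVedioPro project (the ROBOTSTXT_OBEY value is an assumption added here, not something from the original post):

# liVedioPro/settings.py (excerpt)
BOT_NAME = 'liVedioPro'
SPIDER_MODULES = ['liVedioPro.spiders']
NEWSPIDER_MODULE = 'liVedioPro.spiders'

# Assumption: ignore robots.txt so the video file requests are not filtered out
ROBOTSTXT_OBEY = False

# Register the download pipeline defined above
ITEM_PIPELINES = {
    'liVedioPro.pipelines.LivedioproPipeline': 300,
}

With that in place, running scrapy crawl liDemo from the project root downloads each video into the PearVideo folder created by open_spider.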