scrapy-爬取猫眼电影-存储至csv中
2020-06-26 本文已影响0人
lvyz0207
1、新建项目
scrapy startproject maoyan
scrapy genspider maoyan_film maoyan.com
2、编写item
import scrapy
class MaoyanItem(scrapy.Item):
    """Container for one movie scraped from the Maoyan film list."""

    # Movie title, genre and release date.
    film_name = scrapy.Field()
    film_type = scrapy.Field()
    film_date = scrapy.Field()
3、编写spider
import scrapy
from maoyan.items import MaoyanItem
class MaoyanFilmSpider(scrapy.Spider):
    """Spider that crawls the Maoyan "classic films" list page and
    yields one MaoyanItem per movie card."""

    name = 'maoyan_film'
    # allowed_domains takes bare domain names, not URLs: with a URL here
    # Scrapy's offsite middleware filters out every request.
    allowed_domains = ['maoyan.com']
    start_urls = ["https://maoyan.com/films?showType=3"]

    def parse(self, response):
        """Parse the film list page.

        Yields one MaoyanItem per <dd> movie card with film_name,
        film_type and film_date populated (empty string when a node
        is missing, instead of raising IndexError).
        """
        for dd in response.xpath('//dl[@class="movie-list"]/dd'):
            # A fresh item per movie: reusing one instance across the loop
            # would mutate and yield the same object repeatedly.
            film = MaoyanItem()
            # Movie title from the card's title attribute.
            titles = dd.xpath(
                './/div[@class="channel-detail movie-item-title"]/@title'
            ).extract()
            film["film_name"] = titles[0] if titles else ""
            # Genre: text() node 0 is layout whitespace, the value is at
            # index 1 — presumably stable for this page layout; verify if
            # Maoyan changes its markup.
            types = dd.xpath('.//div[1]/div[2]/a/div/div[2]/text()').extract()
            film["film_type"] = types[1].strip() if len(types) > 1 else ""
            # Release date, same node layout as the genre row.
            dates = dd.xpath('.//div[1]/div[2]/a/div/div[4]/text()').extract()
            film["film_date"] = dates[1].strip() if len(dates) > 1 else ""
            yield film
4、修改pipeline
import pandas
class MaoyanPipeline:
    """Append each scraped item as one row to ./work02_movie.csv."""

    def process_item(self, item, spider):
        """Write *item* to the CSV and pass it through unchanged.

        The original code had two defects fixed here:
        - DataFrame(item.values()) laid the three fields out as a
          3-row single column; wrapping them in a list writes one
          CSV row per item.
        - to_csv's default mode 'w' truncated the file on every item,
          so only the last movie survived; mode='a' appends.
        """
        row = pandas.DataFrame([list(item.values())])
        row.to_csv('./work02_movie.csv', mode='a', encoding='utf8',
                   index=False, header=False)
        return item
5、修改settings
# Default headers sent with every request. The original snippet shipped an
# empty User-Agent, which Maoyan's anti-bot checks reject — a real desktop
# browser UA is required.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/83.0.4103.116 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate, br",
    "Pragma": "no-cache",
    "Host": "maoyan.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    # Paste a fresh browser cookie here if the site starts serving the
    # verification page — presumably needed for sustained crawling; confirm
    # against the live site.
    "Cookie": ''
}
# Enable the CSV pipeline defined in maoyan/pipelines.py.
ITEM_PIPELINES = {
    'maoyan.pipelines.MaoyanPipeline': 300,
}
6、执行爬虫命令
scrapy crawl maoyan_film
leaf