Scrapy crawler - 1
2018-07-25
灵动的小猪
This spider scrapes faculty information from a research institute; the exact target can be seen in the code (www.ib.cas.cn). Because the version of Scrapy I use requires Python 2 and I have not tried Python 3 yet, I use Miniconda to keep separate Python environments.
Create a new environment with conda:
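A minimal sketch; the environment name scrapy2 is an arbitrary choice of mine:

conda create -n scrapy2 python=2.7   # a Python 2 environment for Scrapy
source activate scrapy2              # older conda syntax; newer versions use `conda activate scrapy2`
pip install scrapy                   # install Scrapy into the new environment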
scrapy startproject beijingplant
cd beijingplant/beijingplant/spiders
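For orientation, scrapy startproject lays out the usual skeleton (only the files touched below are shown); the spider file beijingplant.py is created by hand inside spiders/:

beijingplant/
    scrapy.cfg                # deploy configuration
    beijingplant/
        items.py              # item definitions (below)
        pipelines.py          # item pipelines (below)
        settings.py           # project settings
        spiders/
            beijingplant.py   # the spider itself (below)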
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class WeiminItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()      # teacher's name
    link = scrapy.Field()      # href of a staff category page
    link2 = scrapy.Field()
    email = scrapy.Field()
    position = scrapy.Field()
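A scrapy.Item behaves like a dict that only accepts the declared fields, which is how the spider below fills it; the values here are placeholders:

items = WeiminItem()
items['name'] = 'placeholder name'    # allowed: 'name' is a declared field
# items['phone'] = '123'              # would raise KeyError: 'phone' is not declared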
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# for JSON output:
# from scrapy.exporters import JsonItemExporter
# for JSON Lines output:
# from scrapy.exporters import JsonLinesItemExporter
# for CSV output:
from scrapy.exporters import CsvItemExporter


class WeiminPipeline(object):

    def open_spider(self, spider):
        # Optional hook, called when the spider is opened.
        # Export the items to weimin.csv.
        self.file = open('weimin.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Optional hook, called when the spider is closed.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
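The commented-out imports hint at the alternatives: switching the output format only means swapping the exporter in open_spider. A sketch for JSON Lines output; the file name weimin.jl is my assumption:

from scrapy.exporters import JsonLinesItemExporter

self.file = open('weimin.jl', 'wb')                                  # assumed file name
self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8')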
beijingplant.py
import scrapy
from beijingplant.items import WeiminItem


class ZhanSpider(scrapy.Spider):
    name = 'zhang'
    allowed_domains = ['ib.cas.cn']
    start_urls = ['http://www.ib.cas.cn/duiwu/']

    def parse(self, response):
        # Level 1: follow the links to each staff category page.
        for url_list in response.xpath('//td[@width="134"]'):
            link = url_list.xpath('./a/@href')[0].extract()
            # urljoin resolves relative hrefs against the current page
            yield scrapy.Request(response.urljoin(link), callback=self.parse_name1)

    def parse_name1(self, response):
        # Level 2: follow the links to each teacher's personal page.
        for url2 in response.xpath("//td[@valign='top']/div/div//table/tbody/tr/td//a/@href").extract():
            yield scrapy.Request(response.urljoin(url2), callback=self.parse_name2)

    def parse_name2(self, response):
        # Level 3: extract name, email and position from the profile page.
        for url3_list in response.xpath('.//table[@bgcolor="#e7e7e7"]/tbody/tr[2]'):
            items = WeiminItem()
            items['name'] = url3_list.xpath('.//p[@style="LINE-HEIGHT: 150%; MARGIN-TOP: 0px; MARGIN-BOTTOM: 0px"]/text()')[0].extract().encode('utf-8')
            # Not every profile lists an email address.
            if len(url3_list.xpath('//*[@id="table25"]/tbody/tr[3]/td/a/text()')):
                items['email'] = url3_list.xpath('//*[@id="table25"]/tbody/tr[3]/td/a/text()')[0].extract().encode('utf-8')
            items['position'] = url3_list.xpath('//*[@id="table25"]/tbody/tr[1]/td/text()')[0].extract().encode('utf-8')
            yield items
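Before running the full crawl, the XPath expressions can be tested interactively with scrapy shell:

scrapy shell 'http://www.ib.cas.cn/duiwu/'
# at the prompt, e.g. the first-level links:
>>> response.xpath('//td[@width="134"]/a/@href').extract()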
Finally, enable the pipeline in settings.py and the spider is ready to run.
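A minimal sketch of that change, matching the names used above (300 is just a conventional priority value):

# settings.py
ITEM_PIPELINES = {
    'beijingplant.pipelines.WeiminPipeline': 300,
}

Then scrapy crawl zhang (the spider's name attribute) runs the crawl and writes weimin.csv:

scrapy crawl zhang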