Scrapy crawler - 1

2018-07-25  by 灵动的小猪

This spider scrapes information about university faculty members; you can see which school it is from the code.
Because the Scrapy setup I use requires Python 2 and I have not tried Python 3 yet, I use miniconda to keep separate Python environments.
Create a new Python environment with conda.
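A minimal example (the environment name scrapy2 is my own choice; on conda older than 4.4, use source activate instead of conda activate):

conda create -n scrapy2 python=2.7
conda activate scrapy2
pip install scrapy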

scrapy startproject beijingplant
cd beijingplant/beijingplant/spiders
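Scrapy can also scaffold a spider skeleton inside the spiders directory for you (optional; the spider below was written by hand):

scrapy genspider zhang ib.cas.cn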

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WeiminItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()      # staff member's name
    link = scrapy.Field()      # relative link collected on the directory page
    link2 = scrapy.Field()     # spare field for a second-level link (unused below)
    email = scrapy.Field()     # e-mail address, when the profile lists one
    position = scrapy.Field()  # job title
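An item behaves like a dict, which makes a quick sanity check in a Python shell easy (the values here are made up):

>>> from beijingplant.items import WeiminItem
>>> item = WeiminItem(name='Zhang San')
>>> item['position'] = 'Professor'
>>> dict(item)  # key order may vary
{'name': 'Zhang San', 'position': 'Professor'}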

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# To export as JSON:
# from scrapy.exporters import JsonItemExporter
# To export as JSON Lines (.jl):
# from scrapy.exporters import JsonLinesItemExporter
# To export as CSV:
from scrapy.exporters import CsvItemExporter

class WeiminPipeline(object):

    def open_spider(self, spider):
        # Optional hook, called when the spider is opened.
        # Write all exported items to weimin.csv.
        self.file = open('weimin.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Optional hook, called when the spider is closed.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
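If you prefer JSON Lines output, the commented-out import above is all you need; here is the same pipeline as a sketch with the exporter swapped (the class name and the weimin.jl filename are my own choices):

from scrapy.exporters import JsonLinesItemExporter

class WeiminJsonLinesPipeline(object):

    def open_spider(self, spider):
        # One JSON object per line makes the output easy to stream-process.
        self.file = open('weimin.jl', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item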

beijingplant.py

import scrapy
from beijingplant.items import WeiminItem


class ZhanSpider(scrapy.Spider):
    name = 'zhang'
    allowed_domains = ['ib.cas.cn']
    start_urls = ['http://www.ib.cas.cn/duiwu/']

    def parse(self, response):
        # Each department cell on the directory page links to a staff list.
        items = WeiminItem()
        for url_list in response.xpath('//td[@width="134"]'):
            items['link'] = url_list.xpath('./a/@href')[0].extract()
            # urljoin resolves the relative href against the current page URL.
            url1 = response.urljoin(items['link'])
            yield scrapy.Request(url1, callback=self.parse_name1)

    def parse_name1(self, response):
        # Follow every link in the staff table to the personal profile page.
        for url2 in response.xpath("//td[@valign='top']/div/div//table/tbody/tr/td//a/@href").extract():
            yield scrapy.Request(response.urljoin(url2), callback=self.parse_name2)

    def parse_name2(self, response):
        # Pull the name, e-mail, and position out of the profile page,
        # yielding one item per matched row.
        for url3_list in response.xpath('.//table[@bgcolor="#e7e7e7"]/tbody/tr[2]'):
            items = WeiminItem()
            items['name'] = url3_list.xpath('.//p[@style="LINE-HEIGHT: 150%; MARGIN-TOP: 0px; MARGIN-BOTTOM: 0px"]/text()')[0].extract().encode('utf-8')
            # Not every profile lists an e-mail address.
            if len(url3_list.xpath('//*[@id="table25"]/tbody/tr[3]/td/a/text()')):
                items['email'] = url3_list.xpath('//*[@id="table25"]/tbody/tr[3]/td/a/text()')[0].extract().encode('utf-8')
            items['position'] = url3_list.xpath('//*[@id="table25"]/tbody/tr[1]/td/text()')[0].extract().encode('utf-8')
            yield items
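These XPath expressions are tied to the page layout, so it is worth verifying them interactively with scrapy shell before a full crawl; note also that tbody is often inserted by browsers and may not exist in the raw HTML Scrapy downloads:

scrapy shell 'http://www.ib.cas.cn/duiwu/'
>>> response.xpath('//td[@width="134"]/a/@href').extract()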

Finally, register the pipeline in settings.py and the project is ready to run.
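A minimal sketch of that setting (300 is the conventional default priority):

# settings.py
ITEM_PIPELINES = {
    'beijingplant.pipelines.WeiminPipeline': 300,
}

Then start the crawl by the spider's name:

scrapy crawl zhang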
