Scraping Matchmaking Site Data with CrawlSpider
2017-11-18
博行天下
Scraping matchmaking site data with CrawlSpider; without further ado, straight to the code.
- First, create the project: scrapy startproject HongNiangNet
- Then generate a crawl-template spider: scrapy genspider -t crawl hongniang hongniang.com
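These two commands produce the standard Scrapy project skeleton; the layout below follows the usual Scrapy 1.x template and may differ slightly between versions:

HongNiangNet/
    scrapy.cfg
    HongNiangNet/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            hongniang.py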
- items.py
from scrapy import Field, Item

class HongniangnetItem(Item):
    # define the fields for your item here like:
    # username
    username = Field()
    # age
    age = Field()
    # avatar image link
    header_link = Field()
    # album image links
    images_url = Field()
    # personal statement
    content = Field()
    # place of origin
    place_from = Field()
    # education
    education = Field()
    # hobbies
    hobby = Field()
    # link to the member's profile page
    source_url = Field()
    # source website
    source = Field()
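For anyone new to Scrapy: an Item behaves like a dict with a fixed set of allowed keys, which is what lets the pipeline further down serialize it with dict(item). A quick sketch, using a hypothetical value:

item = HongniangnetItem()
item["username"] = "example_user"  # hypothetical value
print(dict(item))                  # {'username': 'example_user'}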
- hongniang.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from HongNiangNet.items import HongniangnetItem

class HongniangSpider(CrawlSpider):
    name = 'hongniang'
    allowed_domains = ['hongniang.com']
    start_urls = ['http://www.hongniang.com/match?&page=1']

    # pattern for each listing page; the "?" must be escaped, since an
    # unescaped "?" is a regex quantifier, not a literal question mark
    page_links = LinkExtractor(allow=(r"hongniang.com/match\?&page=\d+",))
    # pattern for each member's profile page
    profile_links = LinkExtractor(allow=(r"hongniang.com/user/member/id/\d+",))

    rules = (
        # no callback, so follow defaults to True: keep paginating
        Rule(page_links),
        # with a callback, follow defaults to False, so it is set explicitly
        Rule(profile_links, callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        item = HongniangnetItem()
        # note: XPath positional indexes start at 1, not 0
        # username
        item["username"] = self.get_username(response)
        # age
        item["age"] = self.get_age(response)
        # avatar image link
        item["header_link"] = self.get_header_link(response)
        # album image links
        item["images_url"] = self.get_images_url(response)
        # personal statement
        item["content"] = self.get_content(response)
        # place of origin
        item["place_from"] = self.get_place_from(response)
        # education
        item["education"] = self.get_education(response)
        # hobbies
        item["hobby"] = self.get_hobby(response)
        # link to the member's profile page
        item["source_url"] = response.url
        # source website
        item["source"] = "hongniang"
        yield item
    def get_username(self, response):
        username = response.xpath("//div[@class='name nickname']/text()").extract()
        if len(username):
            username = username[0]
        else:
            username = "NULL"
        return username.strip()

    def get_age(self, response):
        age = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div[1]/ul[1]/li[1]/text()").extract()
        if len(age):
            age = age[0]
        else:
            age = "NULL"
        return age.strip()

    def get_header_link(self, response):
        header_link = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img[1]/@src").extract()
        if len(header_link):
            header_link = header_link[0]
        else:
            header_link = "NULL"
        return header_link.strip()

    def get_images_url(self, response):
        # unlike the other getters, this one returns the whole list of album URLs
        images_url = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='left']/div[@id='tFocus']/div[@id='tFocusBtn']/div[@id='tFocus-btn']/ul//img/@src").extract()
        if not len(images_url):
            images_url = "NULL"
        return images_url

    def get_content(self, response):
        content = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info5']/div[@class='text']/text()").extract()
        if len(content):
            content = content[0]
        else:
            content = "NULL"
        return content.strip()

    def get_place_from(self, response):
        place_from = response.xpath("//div[@class='mem_main']/div[@class='sub2']/div[@class='info1'][1]/div[@class='right']/ul[2]/li[1]/text()").extract()
        if len(place_from):
            place_from = place_from[0]
        else:
            place_from = "NULL"
        return place_from.strip()

    def get_education(self, response):
        education = response.xpath("//div[@class='mem_main']/div[@class='sub1']/div[@class='right']/div[@class='info2']/div/ul[2]/li[2]/text()").extract()
        if len(education):
            education = education[0]
        else:
            education = "NULL"
        return education.strip()

    def get_hobby(self, response):
        hobby = response.xpath("//div[@class='mem_main']//div[@class='sub2']/div[@class='info1'][2]/div[@class='right'][1]/ul[1]/li[4]/text()").extract()
        if len(hobby):
            hobby = hobby[0]
        else:
            hobby = "NULL"
        return hobby.strip()
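One fix in the spider above is worth spelling out. The original pagination pattern was r"hongniang.com/match?&page=\d+"; in a regular expression an unescaped ? makes the preceding character optional, so that pattern never matches the real listing URLs. A standalone check with Python's re module shows the difference:

import re

url = "http://www.hongniang.com/match?&page=2"

# original pattern: the "h" becomes optional and a literal "&" must
# follow "matc", which never happens in the real URL, so no match
print(re.search(r"hongniang.com/match?&page=\d+", url))   # None

# escaped pattern matches the literal "?" in the query string
print(re.search(r"hongniang.com/match\?&page=\d+", url))  # matches 'hongniang.com/match?&page=2'

LinkExtractor applies its allow patterns as regex searches against candidate URLs, so the unescaped version would silently break pagination.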
- pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json

class HongniangnetPipeline(object):
    def __init__(self):
        self.filename = codecs.open('content.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # serialize each item as one JSON object per line (JSON Lines)
        line = json.dumps(dict(item), ensure_ascii=False)
        self.filename.write(line + '\n')
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider (not spider_closed) on pipelines
        # when the spider finishes
        self.filename.close()
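With this pipeline enabled, content.json ends up in JSON Lines format, one record per line. A record would look roughly like this (all values invented for illustration):

{"username": "...", "age": "...", "header_link": "http://...", "images_url": ["http://...", "http://..."], "content": "...", "place_from": "...", "education": "...", "hobby": "...", "source_url": "http://www.hongniang.com/user/member/id/...", "source": "hongniang"}

Scrapy's built-in feed exports (scrapy crawl hongniang -o content.jl) can produce the same format without a custom pipeline; the pipeline is shown here for learning purposes.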
- settings.py
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0)'
ITEM_PIPELINES = {
    'HongNiangNet.pipelines.HongniangnetPipeline': 300,
}
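If the project template set ROBOTSTXT_OBEY = True (the default in projects generated since Scrapy 1.1), you may also need to set it to False in settings.py for this crawl to proceed; check the site's terms before doing so. Then run the spider from the project root:

scrapy crawl hongniang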
This project uses CrawlSpider to scrape a matchmaking site and collect part of each member's basic profile information. The scraped data shared here is purely for personal practice; if it infringes on anyone's privacy, please let me know right away and I will delete all of the scraped information.