Crawling a Website with Scrapy
2018-02-11
whong736
1. Create a new project
scrapy startproject ts
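startproject generates the standard Scrapy skeleton; the layout should look roughly like this (a sketch; newer Scrapy versions also add a middlewares.py):
ts/
    scrapy.cfg
    ts/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py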
2. Create a spider
cd ts
scrapy genspider -t basic leeson hellobi.com
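This writes a minimal spider into ts/spiders/leeson.py; the generated skeleton looks roughly like the following (a sketch of the basic template, the default start_urls line may differ by Scrapy version), and step 5 below fills it in:
# -*- coding: utf-8 -*-
import scrapy


class LeesonSpider(scrapy.Spider):
    name = 'leeson'
    allowed_domains = ['hellobi.com']
    start_urls = ['http://hellobi.com/']

    def parse(self, response):
        pass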
3. Create a Python file main.py
from scrapy import cmdline
cmdline.execute("scrapy crawl leeson".split())
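With main.py in the project root (next to scrapy.cfg, an assumption about where you saved it), the crawl can be launched from an IDE or directly from the command line; this is equivalent to running scrapy crawl leeson manually:
cd ts
python main.py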
4. Edit items.py to define the fields to scrape
import scrapy


class TsItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # course title
    link = scrapy.Field()   # course page link
    stu = scrapy.Field()    # number of students enrolled
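A TsItem behaves like a dictionary, which is how the spider below fills it in; a minimal sketch in a Python shell (the sample value is made up):
from ts.items import TsItem

item = TsItem()
item["title"] = ["Example course"]
print(dict(item))   # {'title': ['Example course']}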
5. Write the spider
# -*- coding: utf-8 -*-
import scrapy
from ts.items import TsItem   # import the item class defined in step 4
from scrapy.http import Request


class LeesonSpider(scrapy.Spider):
    name = 'leeson'
    allowed_domains = ['hellobi.com']
    start_urls = ['https://edu.hellobi.com/course/1']

    def parse(self, response):
        item = TsItem()
        item["title"] = response.xpath("//ol[@class='breadcrumb']/li[@class='active']/text()").extract()
        item["link"] = response.xpath("//ul[@class='nav nav-tabs']/li[@class='active']/a/@href").extract()
        item["stu"] = response.xpath("//span[@class='course-view']/text()").extract()
        yield item
        # loop over the remaining course pages in the chosen id range
        for i in range(2, 255):
            url = "https://edu.hellobi.com/course/" + str(i)
            yield Request(url, callback=self.parse)
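Before running the full crawl, the XPath expressions can be tested interactively with scrapy shell (the exact output depends on the live page):
scrapy shell "https://edu.hellobi.com/course/1"
>>> response.xpath("//ol[@class='breadcrumb']/li[@class='active']/text()").extract()
>>> response.xpath("//span[@class='course-view']/text()").extract()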
6. In settings.py, uncomment ITEM_PIPELINES
ITEM_PIPELINES = {
    'ts.pipelines.TsPipeline': 300,
}
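The number 300 is the pipeline's order; if more pipelines were enabled, lower values would run first. A sketch with a second, purely hypothetical pipeline to show the ordering:
ITEM_PIPELINES = {
    'ts.pipelines.TsPipeline': 300,
    # 'ts.pipelines.AnotherPipeline': 400,  # hypothetical; would run after TsPipeline
}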
7. Write the pipeline (pipelines.py)
class TsPipeline(object):
    def process_item(self, item, spider):
        # print the scraped fields as a quick test
        print(item["title"])
        print(item["link"])
        print(item["stu"])
        print("-------------")
        return item
8. Run scrapy crawl leeson --nolog; the printed fields for each course appear in the console.
9. Improve the pipeline to write the results to a file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TsPipeline(object):
    # new: runs first; open the output file in append mode ("a") and keep the
    # handle on self so process_item can use it
    def __init__(self):
        self.fh = open("/Users/vincentwen/ts/leesons.txt", "a")

    def process_item(self, item, spider):
        print(item["title"])
        print(item["link"])
        print(item["stu"])
        print("-------------")
        # each field is a list, so take the first element and write one record per course
        self.fh.write(item["title"][0] + "\n" + item["link"][0] + "\n" + item["stu"][0] + "\n" + "-------------" + "\n")
        return item

    # new: close the file when the spider finishes (Scrapy passes the spider argument)
    def close_spider(self, spider):
        self.fh.close()
Re-run the spider:
scrapy crawl leeson --nolog
Open the file to see the data that was written.
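As an aside, for a quick dump Scrapy's built-in feed export can also write the scraped items to a file without a custom pipeline, e.g. (the output file name is arbitrary):
scrapy crawl leeson -o leesons.json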