爬虫练手:使用scrapy抓取当当网程序设计类图书信息,并保存到MySQL数据库
2016-12-28 本文已影响820人
BlueCat2016
爬取目标
当当网程序设计类图书信息,网址为:
http://category.dangdang.com/cp01.54.06.00.00.00.html
开发环境
Python 3.5 / MySQL 5.6 / Scrapy 1.3
Python 运行在 Windows 上,MySQL 运行在 CentOS 6.7 上
源代码
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Dangdang01Item(scrapy.Item):
    """Record for one book listing scraped by the spider.

    All three fields are filled from XPath text/attribute extractions,
    so their values are strings.
    """

    # Book title text.
    title = scrapy.Field()
    # Link (href) taken from the listing entry.
    link = scrapy.Field()
    # Number of comments, as the text shown on the listing page.
    comment = scrapy.Field()
dangdang.py (爬虫程序)
# -*- coding: utf-8 -*-
import scrapy
from dangdang01.items import Dangdang01Item
from scrapy.http import Request
class DangdangSpider(scrapy.Spider):
    """Spider for the Dangdang programming-book category listing.

    ``parse`` yields one ``Dangdang01Item`` per book found on a listing page
    and then yields requests for listing pages 1 through 100, each handled
    by ``parse`` again.
    """

    name = "dangdang"
    allowed_domains = ["dangdang.com"]
    start_urls = [
        'http://category.dangdang.com/cp01.54.06.00.00.00-srsort_score_desc-f0%7C0%7C0%7C0%7C0%7C1%7C0%7C0%7C0%7C0%7C0%7C0%7C0-shlist.html']

    def parse(self, response):
        """Extract book items from a listing page and schedule more pages.

        Args:
            response: The downloaded listing page.

        Yields:
            ``Dangdang01Item`` objects with ``title``, ``link`` and
            ``comment`` set, followed by ``Request`` objects for pages 1-100.
        """
        titles = response.xpath("//p[@class='name']/a/text()").extract()
        links = response.xpath("//a[@class='pic']/@href").extract()
        comments = response.xpath("//a[@name='P_pl']/text()").extract()

        # The three lists are paired by position. zip() stops at the shortest
        # list, so a page where one XPath matches fewer nodes no longer raises
        # IndexError (which would also have skipped the page requests below).
        # NOTE(review): assumes the n-th title, link and comment belong to the
        # same book -- confirm against the page markup.
        for title, link, comment in zip(titles, links, comments):
            dd = Dangdang01Item()
            dd["title"] = title
            dd["link"] = link
            dd["comment"] = comment
            yield dd

        # Every response re-yields the same 100 page URLs.
        # NOTE(review): this presumably relies on Scrapy's default duplicate
        # request filtering to avoid crawling a page twice -- confirm it is
        # not disabled in settings.
        page_url = (
            "http://category.dangdang.com/pg{}"
            "-cp01.54.06.00.00.00-srsort_score_desc-f0%7C0%7C0%7C0%7C0%7C1%7C0%7C0%7C0%7C0%7C0%7C0%7C0-shlist.html"
        )
        for page in range(1, 101):
            yield Request(page_url.format(page), callback=self.parse)
建立数据库和表
Paste_Image.png
查看权限分配
Paste_Image.png
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import sys
class Dangdang01Pipeline(object):
    """Item pipeline that inserts each scraped book into the ``book`` table.

    A single MySQL connection is opened when the pipeline is created and
    closed when the spider finishes.
    """

    def __init__(self):
        """Open the MySQL connection used for all inserts."""
        # NOTE(review): host and credentials are hard-coded here; consider
        # moving them to the project settings.
        # The charset is passed at connect time instead of calling
        # set_charset() afterwards, which newer PyMySQL releases deprecate.
        self.conn = pymysql.connect(
            host="192.168.1.188",
            user="root",
            password="654321",
            db="dangdang",
            charset="utf8",
        )

    def process_item(self, item, spider):
        """Insert one item into the ``book`` table and pass it on.

        Args:
            item: Mapping with ``title``, ``link`` and ``comment`` keys.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.
        """
        # Placeholders let the driver escape the values; building the SQL by
        # concatenation broke on any value containing a quote character.
        # NOTE(review): the column is named ``commit`` while the item field is
        # ``comment`` -- confirm this matches the table definition.
        sql = "insert into book(title,link,commit) values (%s,%s,%s)"
        params = (item["title"], item["link"], item["comment"])

        cursor = self.conn.cursor()
        try:
            cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            # Best effort: report the failure, undo the partial transaction
            # and keep the crawl running.
            print(e)
            self.conn.rollback()
        finally:
            cursor.close()
        return item

    def close_spider(self, spider=None):
        """Close the MySQL connection when the spider finishes.

        Args:
            spider: The spider being closed (unused). Scrapy passes it as an
                argument; the default keeps argument-less calls working.
        """
        self.conn.close()