Saving data from a Scrapy crawler to MySQL
2017-07-20
童蒙vlog
Let's go straight to an example.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

from spiderman.env import *


class SpidermanPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(
            host=dbhost,
            user=dbuser,
            password=dbpass,
            db=dbname,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.conn.cursor()
        self.table = table_pool
        # Funding-round keywords; an item is stored only if its
        # item_round contains one of these (matched lower-cased).
        self.filters = [
            '测试',      # test
            'demo',
            'pre-a',
            '概念',      # concept
            '天使',      # angel
            '未',        # not yet (funded)
            '种子',      # seed
            '不明确',    # unclear
            '上线',      # launched
            '正在运营',  # in operation
        ]

    def process_item(self, item, spider):
        # Skip duplicates: the pair (item_id, item_from) identifies a record.
        # Values are passed as query parameters so pymysql escapes them,
        # instead of splicing them into the SQL string by hand.
        # (The table name comes from a trusted constant in env.py; table
        # names cannot be parameterized, so it is concatenated directly.)
        qsql = ("SELECT COUNT(*) AS cnt FROM " + self.table +
                " WHERE item_id=%s AND item_from=%s")
        self.cursor.execute(qsql, (item['item_id'], item['item_from']))
        if self.cursor.fetchone()['cnt'] > 0:
            print('*********** duplicate record, skipped ***************')
            return None

        sql = ("INSERT INTO " + self.table + " (`item_id`, `item_name`, "
               "`item_logo`, `item_brief`, `item_area`, `item_from`, "
               "`item_CEO`, `item_round`, `item_phone`, `item_email`, "
               "`item_weixin`, `item_weibo`, `item_website`, "
               "`item_from_website`, `item_address`) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, "
               "%s, %s, %s, %s, %s, %s, %s)")
        params = (item['item_id'],
                  item['item_name'],
                  item['item_logo'],
                  item['item_brief'],
                  item['item_area'],
                  item['item_from'],
                  item['item_CEO'],
                  item['item_round'],
                  item['item_phone'],
                  item['item_email'],
                  item['item_weixin'],
                  item['item_weibo'],
                  item['item_website'],
                  item['item_from_website'],
                  item['item_address'])
        # Store the item only when its funding round matches one of the
        # filter keywords and the item has a website.
        for value in self.filters:
            if value in item['item_round'].lower() and item['item_website']:
                self.cursor.execute(sql, params)
                self.conn.commit()
                print(sql)
                break
        return item

    def close_spider(self, spider):
        self.conn.close()
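As the boilerplate comment at the top reminds us, a pipeline only runs once it is registered in the project's settings.py. A minimal sketch, assuming the project module is named spiderman as in the import above:

# settings.py
ITEM_PIPELINES = {
    'spiderman.pipelines.SpidermanPipeline': 300,  # lower number = runs earlier
}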
Here env.py holds the connection constants:
#!/usr/bin/python3
# -*- coding: utf-8 -*-

# Database connection constants used by the pipeline.
dbuser = 'root'
dbpass = '123456'
dbname = 'testdb'
dbhost = '127.0.0.1'
dbport = 3306  # not passed to pymysql.connect above; 3306 is its default anyway
table_pool = 'test'
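The target table has to exist before the spider runs. The original post never shows its schema, so the following is only a sketch inferred from the columns in the INSERT statement: every field type and length is an assumption, and the (item_id, item_from) pair is made unique because that is exactly the key the duplicate check queries. Adjust to your actual data.

-- Assumed schema; the original post does not define it.
CREATE TABLE `test` (
    `item_id`           VARCHAR(64)  NOT NULL,
    `item_name`         VARCHAR(255),
    `item_logo`         VARCHAR(255),
    `item_brief`        TEXT,
    `item_area`         VARCHAR(64),
    `item_from`         VARCHAR(64)  NOT NULL,
    `item_CEO`          VARCHAR(64),
    `item_round`        VARCHAR(64),
    `item_phone`        VARCHAR(32),
    `item_email`        VARCHAR(128),
    `item_weixin`       VARCHAR(64),
    `item_weibo`        VARCHAR(128),
    `item_website`      VARCHAR(255),
    `item_from_website` VARCHAR(255),
    `item_address`      VARCHAR(255),
    UNIQUE KEY `uniq_item` (`item_id`, `item_from`)
) DEFAULT CHARSET = utf8mb4;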