Using the SQLite3 Database
2018-11-01 · 戒灵
1. Download SQLiteStudio, a visual GUI tool (handy for creating tables by hand).
2. Write a module dedicated to database access (the final example imports it as local_db):
# -*- coding: UTF-8 -*-
import os
import sqlite3


class DB:
    """Thin sqlite3 wrapper that creates the schema the first time the file appears."""
    new_db = False

    def __init__(self, sqlite3_db='test.db', check_same_thread=False):
        self.sqlite3_db = sqlite3_db
        self.check_same_thread = check_same_thread
        if not os.path.exists(sqlite3_db):
            self.new_db = True
        # isolation_level=None puts the connection in autocommit mode
        self.conn = sqlite3.connect(sqlite3_db, isolation_level=None,
                                    check_same_thread=self.check_same_thread)
        self.cursor = self.conn.cursor()
        self.create_db()

    def query(self, sql, param=None):
        """Execute a statement, optionally with bound parameters, and return the cursor."""
        if param:
            res = self.cursor.execute(sql, param)
        else:
            res = self.cursor.execute(sql)
        self.conn.commit()
        return res

    def create_db(self):
        """Create both tables when the database file is brand new."""
        if self.new_db:
            self.new_db = False
            sql = '''CREATE TABLE amazon_bestseller_category (
                category_id  INTEGER NOT NULL
                             REFERENCES amazon_bestseller_list (id),
                rank         INTEGER,
                top_rank     INTEGER DEFAULT (0),
                review       INTEGER DEFAULT (0),
                star         TEXT DEFAULT (0),
                gen_date     TEXT DEFAULT (''),
                price        TEXT DEFAULT (''),
                detail_link  TEXT NOT NULL,
                sale_monthly TEXT DEFAULT (0),
                is_active    INTEGER DEFAULT (0),
                create_time  TEXT,
                UNIQUE (category_id, detail_link)
            )'''
            self.query(sql)
            sql = '''CREATE TABLE amazon_bestseller_list (
                id           INTEGER PRIMARY KEY AUTOINCREMENT,
                top_category TEXT NOT NULL,
                category     TEXT DEFAULT (''),
                category_url TEXT NOT NULL,
                level        INTEGER NOT NULL DEFAULT (0),
                is_active    INTEGER DEFAULT (0),
                -- the original UNIQUE constraint named bestseller_link, which is not
                -- a column of this table; category_url is the intended column
                UNIQUE (top_category, category_url)
            )'''
            self.query(sql)

    def close(self):
        self.conn.close()
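A quick usage sketch (assuming the module above is saved as local_db.py, which is how the final example imports it; the file name bestsellers.db and the inserted row are illustrative):

from local_db import DB

db = DB('bestsellers.db')  # the first run creates both tables
db.query("INSERT INTO amazon_bestseller_list(top_category, category_url, level) values(?,?,?)",
         ('Books', 'https://www.amazon.com/best-sellers-books-Amazon/zgbs/books', 0))
cur = db.query("SELECT id, top_category, category_url FROM amazon_bestseller_list")
print(cur.fetchall())
db.close()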
3. SQLite3 query statement:
sql = "SELECT a.detail_link,b.top_category FROM amazon_bestseller_category a LEFT JOIN amazon_bestseller_list b ON a.category_id = b.id WHERE (a.top_rank=0 or a.sale_monthly=0) and a.is_active = 1 limit 3000"
4. SQLite3 update statement:
params=(star,dateFirstAvailable,top_rank,sale_monthly,detail_link)
self.db.query(sql, params)
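The original snippet omits the UPDATE statement itself. Judging from the parameter order and the step-2 schema, it plausibly looked like the sketch below; mapping dateFirstAvailable to the gen_date column is an assumption:

# Hedged reconstruction; the real statement is not shown in the original post.
sql = ("UPDATE amazon_bestseller_category "
       "SET star=?, gen_date=?, top_rank=?, sale_monthly=? "
       "WHERE detail_link=?")
params = (star, dateFirstAvailable, top_rank, sale_monthly, detail_link)
self.db.query(sql, params)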
5. SQLite3 insert statement:
params = (self.category_id,rank,review,link,price,'1',create_time)
self.db.query(sql, params)
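The statement bound to these parameters appears verbatim in the full example of step 6:

sql = "INSERT OR REPLACE into amazon_bestseller_category(category_id,rank,review,detail_link,price,is_active,create_time) values(?,?,?,?,?,?,?)"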
6. Full example (join the related fields from the two tables, pull the fields we want, and update up to 3000 rows):
# -*- coding: utf-8 -*-
# import re
# import json
#import MySQLdb
import time
import datetime
import os
import sys
import argparse
# sys.path.append('.')
# from common import local_db, local_util
import scrapy
import requests
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import BaseSpider
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
# from scrapy.http.request.form import _get_form, _get_inputs  # unused private helpers; may break on newer Scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from scrapy.spiders import Rule
from scrapy.selector import Selector
from scrapy.http.cookies import CookieJar
import local_db
# The imports below are required when packaging the script (e.g. with PyInstaller)
# import robotparser
import scrapy.spiderloader
import scrapy.statscollectors
import scrapy.logformatter
import scrapy.dupefilters
import scrapy.squeues
import scrapy.extensions.spiderstate
import scrapy.extensions.corestats
import scrapy.extensions.telnet
import scrapy.extensions.logstats
import scrapy.extensions.memusage
import scrapy.extensions.memdebug
import scrapy.extensions.feedexport
import scrapy.extensions.closespider
import scrapy.extensions.debug
import scrapy.extensions.httpcache
import scrapy.extensions.statsmailer
import scrapy.extensions.throttle
import scrapy.core.scheduler
import scrapy.core.engine
import scrapy.core.scraper
import scrapy.core.spidermw
import scrapy.core.downloader
import scrapy.downloadermiddlewares.stats
import scrapy.downloadermiddlewares.httpcache
import scrapy.downloadermiddlewares.cookies
import scrapy.downloadermiddlewares.useragent
import scrapy.downloadermiddlewares.httpproxy
import scrapy.downloadermiddlewares.ajaxcrawl
# import scrapy.downloadermiddlewares.chunked
import scrapy.downloadermiddlewares.decompression
import scrapy.downloadermiddlewares.defaultheaders
import scrapy.downloadermiddlewares.downloadtimeout
import scrapy.downloadermiddlewares.httpauth
import scrapy.downloadermiddlewares.httpcompression
import scrapy.downloadermiddlewares.redirect
import scrapy.downloadermiddlewares.retry
import scrapy.downloadermiddlewares.robotstxt
import scrapy.spidermiddlewares.depth
import scrapy.spidermiddlewares.httperror
import scrapy.spidermiddlewares.offsite
import scrapy.spidermiddlewares.referer
import scrapy.spidermiddlewares.urllength
import scrapy.pipelines
import scrapy.core.downloader.handlers.http
import scrapy.core.downloader.contextfactory
class AmazonBestsellerCategorySpider(InitSpider):
    name = 'amazon_bestseller_category'
    allowed_domains = ['www.amazon.com']
    try_login_max_time = 3

    def __init__(self, page_url=None, rank_max='20'):
        super(AmazonBestsellerCategorySpider, self).__init__()
        self.page_url = page_url
        self.rank_max = int(rank_max)
        self.db = local_db.DB('./data/data.db')
        # Resolve the category id for this bestseller page URL
        sql = "select id, category_url from amazon_bestseller_list where category_url=?"
        cur = self.db.query(sql, (page_url,))
        row = cur.fetchone()
        self.category_id = row[0] if row else 0

    def init_request(self):
        yield Request(url=self.page_url, callback=self.list_page)

    def list_page(self, response):
        sel = Selector(response)
        row_ele = sel.xpath("//ol[@id='zg-ordered-list']//li")
        for row in row_ele:
            rank = row.xpath(".//span/span[contains(@class,'zg-badge-text')]/text()").extract_first()
            rank = rank.replace('#', '')
            review = row.xpath(".//span[contains(@class,'aok-inline-block')]/div[contains(@class,'a-icon-row')]/a[contains(@class,'a-size-small')]/text()").extract_first()
            if not review:
                review = '0'
            price = row.xpath(".//span[contains(@class,'p13n-sc-price')]/text()").extract_first()
            detail_url = row.xpath(".//span[contains(@class,'zg-item')]/a[@class='a-link-normal']/@href").extract_first()
            # Normalize the product link to https://www.amazon.com/dp/<ASIN>
            detail_url_parsed = response.urljoin(detail_url)
            link_dp = detail_url_parsed[detail_url_parsed.find('/dp/'):]
            link_ref = response.urljoin(link_dp)
            link = link_ref[:link_ref.find('/ref')]
            create_time = datetime.datetime.now().strftime('%Y-%m-%d')
            if int(rank) > self.rank_max:
                continue
            sql = "INSERT OR REPLACE into amazon_bestseller_category(category_id,rank,review,detail_link,price,is_active,create_time) values(?,?,?,?,?,?,?)"
            params = (self.category_id, rank, review, link, price, '1', create_time)
            self.db.query(sql, params)
if __name__ == "__main__":
    # Run from the script's own directory so the relative ./data path works
    script_path = os.path.split(os.path.realpath(sys.argv[0]))[0]
    os.chdir(script_path)
    start_at = time.time()
    if not os.path.exists('./data'):
        os.mkdir('./data')
    parser = argparse.ArgumentParser(description='Arguments')
    parser.add_argument('--page_url', help='Page_url', required=True)
    parser.add_argument('--rank_max', help='rank_max', required=False, default='20')
    args = vars(parser.parse_args())
    page_url = args['page_url']
    rank_max = args['rank_max']
    params = {'page_url': page_url, 'rank_max': rank_max}
    CrawlSettings = {
        # Note: BOT_NAME is meant to be a short crawler name; the UA header
        # actually sent is controlled by USER_AGENT below.
        'BOT_NAME': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'DOWNLOAD_TIMEOUT': 60,
        'DOWNLOAD_DELAY': 3,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
        },
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    process = CrawlerProcess(CrawlSettings)
    process.crawl(AmazonBestsellerCategorySpider, **params)
    process.start()
    end_at = time.time()
    print(end_at - start_at)  # elapsed seconds
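A sample invocation (assuming the script is saved as amazon_bestseller_category.py; the URL is just an illustrative bestseller page):

python amazon_bestseller_category.py --page_url "https://www.amazon.com/Best-Sellers-Books/zgbs/books" --rank_max 20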