Using the SQLite3 Database

2018-11-01  戒灵

1. Download SQLiteStudio, a visual GUI for SQLite (convenient for creating tables by hand).
2. Write a module dedicated to the DB layer (the final example imports it as local_db):

# -*- coding: UTF-8 -*-

import os
import sys
import sqlite3

# sys.path.append('.')
# from common.local_util import *
# import local_util

class DB:
    new_db = False

    def __init__(self, sqlite3_db='test.db', check_same_thread=False):
        self.sqlite3_db        = sqlite3_db
        self.check_same_thread = check_same_thread

        if not os.path.exists(sqlite3_db):
            self.new_db = True

        self.conn = sqlite3.connect(sqlite3_db, isolation_level=None, check_same_thread=self.check_same_thread)
        self.cursor = self.conn.cursor()
        self.create_db()

    def query(self, sql, param=None):
        if param:
            res = self.cursor.execute(sql, param)
        else:
            res = self.cursor.execute(sql)
        self.conn.commit()
        return res

    def create_db(self):
        if self.new_db:
            self.new_db = False
            sql = '''CREATE TABLE amazon_bestseller_category (
                category_id  INTEGER NOT NULL
                                     REFERENCES amazon_bestseller_list (id),
                rank         INTEGER,
                top_rank     INTEGER DEFAULT (0),
                review       INTEGER DEFAULT (0),
                star         TEXT    DEFAULT (0),
                gen_date     TEXT    DEFAULT (''),
                price        TEXT    DEFAULT (''),
                detail_link  TEXT    NOT NULL,
                sale_monthly TEXT    DEFAULT (0),
                is_active    INTEGER DEFAULT (0),
                create_time  TEXT,
                UNIQUE (
                    category_id,
                    detail_link
                )
            )'''
            self.query(sql)
            sql = '''CREATE TABLE amazon_bestseller_list (
                id              INTEGER PRIMARY KEY AUTOINCREMENT,
                top_category    TEXT    NOT NULL,
                category        TEXT    DEFAULT (''),
                category_url    TEXT    NOT NULL,
                level           INTEGER NOT NULL
                                        DEFAULT (0),
                is_active       INTEGER DEFAULT (0),
                UNIQUE (
                    top_category,
                    category_url
                )
            )'''
            self.query(sql)

    def close(self):
        self.conn.close()
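
A minimal usage sketch of the DB helper above, assuming the module is saved as local_db.py (the final example imports it under that name). The table and columns are the ones created in create_db; the URL value is only a placeholder:

# -*- coding: UTF-8 -*-

import local_db

db = local_db.DB('test.db')   # creates the two tables on first run

# insert one bestseller-list row, then read it back
db.query("INSERT OR IGNORE INTO amazon_bestseller_list(top_category, category_url, level) values(?,?,?)",
         ('Books', 'https://www.amazon.com/example-category-url', 0))
cur = db.query("SELECT id, top_category, category_url FROM amazon_bestseller_list")
for row in cur.fetchall():
    print(row)

db.close()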

3. SQLite3 query statement:

sql = "SELECT a.detail_link,b.top_category FROM amazon_bestseller_category a LEFT JOIN amazon_bestseller_list b ON a.category_id = b.id WHERE (a.top_rank=0 or a.sale_monthly=0) and a.is_active = 1 limit 3000"

4. SQLite3 update statement:

        params=(star,dateFirstAvailable,top_rank,sale_monthly,detail_link)
        self.db.query(sql, params)
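
The UPDATE statement itself is omitted above; judging from the parameter order (star, dateFirstAvailable, top_rank, sale_monthly, detail_link) and the columns of amazon_bestseller_category, it would look roughly like this (an assumption, not necessarily the exact statement used):

# assumed shape of the missing statement; the column/parameter pairing is a guess
sql = "UPDATE amazon_bestseller_category SET star=?, gen_date=?, top_rank=?, sale_monthly=? WHERE detail_link=?"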

5. SQLite3 insert statement:

            params = (self.category_id,rank,review,link,price,'1',create_time)
            self.db.query(sql, params)
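
The matching INSERT statement appears in the full example in step 6; for reference it is:

sql = "INSERT OR REPLACE into amazon_bestseller_category(category_id,rank,review,detail_link,price,is_active,create_time) values(?,?,?,?,?,?,?)"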

6. Final example (query the field linking the two tables along with the desired fields, and update 3,000 rows):

# -*- coding: utf-8 -*-

# import re
# import json
#import MySQLdb
import time
import datetime
import os
import sys
import argparse

# sys.path.append('.')
# from common import local_db, local_util

import scrapy
import requests
from scrapy.crawler import CrawlerProcess
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import BaseSpider
from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.http.request.form import _get_form,_get_inputs
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from scrapy.spiders import Rule
from scrapy.selector import Selector
from scrapy.http.cookies import CookieJar

import local_db

# The following imports are required when packaging the script
# import robotparser

import scrapy.spiderloader
import scrapy.statscollectors
import scrapy.logformatter
import scrapy.dupefilters
import scrapy.squeues

import scrapy.extensions.spiderstate
import scrapy.extensions.corestats
import scrapy.extensions.telnet
import scrapy.extensions.logstats
import scrapy.extensions.memusage
import scrapy.extensions.memdebug
import scrapy.extensions.feedexport
import scrapy.extensions.closespider
import scrapy.extensions.debug
import scrapy.extensions.httpcache
import scrapy.extensions.statsmailer
import scrapy.extensions.throttle

import scrapy.core.scheduler
import scrapy.core.engine
import scrapy.core.scraper
import scrapy.core.spidermw
import scrapy.core.downloader

import scrapy.downloadermiddlewares.stats
import scrapy.downloadermiddlewares.httpcache
import scrapy.downloadermiddlewares.cookies
import scrapy.downloadermiddlewares.useragent
import scrapy.downloadermiddlewares.httpproxy
import scrapy.downloadermiddlewares.ajaxcrawl
# import scrapy.downloadermiddlewares.chunked
import scrapy.downloadermiddlewares.decompression
import scrapy.downloadermiddlewares.defaultheaders
import scrapy.downloadermiddlewares.downloadtimeout
import scrapy.downloadermiddlewares.httpauth
import scrapy.downloadermiddlewares.httpcompression
import scrapy.downloadermiddlewares.redirect
import scrapy.downloadermiddlewares.retry
import scrapy.downloadermiddlewares.robotstxt

import scrapy.spidermiddlewares.depth
import scrapy.spidermiddlewares.httperror
import scrapy.spidermiddlewares.offsite
import scrapy.spidermiddlewares.referer
import scrapy.spidermiddlewares.urllength

import scrapy.pipelines

import scrapy.core.downloader.handlers.http
import scrapy.core.downloader.contextfactory


class AmazonBestsellerCategorySpider(InitSpider):
    name = 'amazon_bestseller_category'
    allowed_domains = ['www.amazon.com']
    try_login_max_time = 3

    def __init__(self, page_url=None, rank_max='20', **kwargs):
        # let Scrapy's Spider.__init__ handle name/start_urls and any extra kwargs
        super(AmazonBestsellerCategorySpider, self).__init__(**kwargs)
        self.page_url = page_url
        self.rank_max = int(rank_max)
        self.db = local_db.DB('./data/data.db')
        sql = "select id, category_url from amazon_bestseller_list where category_url=?"
        cur = self.db.query(sql, (page_url,))
        row = cur.fetchone()
        # self.log(row)
        self.category_id = row[0] if row else 0

    def init_request(self):
        yield Request(url=self.page_url,callback=self.list_page)

    def list_page(self,response):
        sel = Selector(response)
        row_ele = sel.xpath("//ol[@id='zg-ordered-list']//li")
        for row in row_ele:
            rank = row.xpath(".//span/span[contains(@class,'zg-badge-text')]/text()").extract_first()
            if not rank:
                continue
            rank = rank.replace('#', '')
            review = row.xpath(".//span[contains(@class,'aok-inline-block')]/div[contains(@class,'a-icon-row')]/a[contains(@class,'a-size-small')]/text()").extract_first()
            if not review:
                review = '0'
            price = row.xpath(".//span[contains(@class,'p13n-sc-price')]/text()").extract_first()
            detail_url = row.xpath(".//span[contains(@class,'zg-item')]/a[@class='a-link-normal']/@href").extract_first()
            detail_url_parsed = response.urljoin(detail_url)
            link_dp = detail_url_parsed[detail_url_parsed.find('/dp/'):]
            link_ref = response.urljoin(link_dp)
            link = link_ref[:link_ref.find('/ref')]
            create_time = datetime.datetime.now().strftime('%Y-%m-%d')

            if int(rank) > self.rank_max:
                continue
            sql = "INSERT OR REPLACE into amazon_bestseller_category(category_id,rank,review,detail_link,price,is_active,create_time) values(?,?,?,?,?,?,?)"
            params = (self.category_id,rank,review,link,price,'1',create_time)
            self.db.query(sql, params)


if __name__ == "__main__":
    script_path = os.path.split( os.path.realpath( sys.argv[0] ) )[0]
    os.chdir(script_path)
    start_at = time.time()

    if not os.path.exists('./data'):
        os.mkdir('./data')

    parser = argparse.ArgumentParser(description='Arguments')
    parser.add_argument('--page_url', help='Page_url', required=True)
    parser.add_argument('--rank_max', help='rank_max', required=False, default='20')
    args = vars(parser.parse_args())
    page_url = args['page_url']
    rank_max = args['rank_max']
    params = {'page_url':page_url,'rank_max':rank_max}
    CrawlSettings = {
        'BOT_NAME': 'amazon_bestseller_category',  # BOT_NAME is the project name; the browser string belongs in USER_AGENT below
        'DOWNLOAD_TIMEOUT' : 60,
        'DOWNLOAD_DELAY': 3,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
        },
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    process = CrawlerProcess(CrawlSettings)
    process.crawl(AmazonBestsellerCategorySpider,**params)
    process.start()

    end_at = time.time()
    print(end_at - start_at)
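
To run the final example from the command line (the filename amazon_bestseller_category.py is just an assumed name for the script above, and the URL is a placeholder for a real bestseller category page):

python amazon_bestseller_category.py --page_url "https://www.amazon.com/Best-Sellers/zgbs/example" --rank_max 20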