chezhiwang_spider

2020-07-05  overad
#! /usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2020/7/5 15:10
# @File : chezhiwangspider
# @Software: PyCharm


# Listing page URL pattern (page 7 shown as an example):
# http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml
# 10510: number of listing pages at crawl time (see the loop bound below)



import random
import time
from datetime import datetime

import pymysql
import requests
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent

ua = UserAgent()

# MySQL connection settings
db_config = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '*******',
    'charset': 'utf8',
    'db': 'chezhiwang'
}

conn = pymysql.connect(**db_config)
cur = conn.cursor()
sql = "insert into chezhiwang.complaint(complaint_id, car_brand, car_series, car_model, description, topical_prob, cp_tm, cp_status, crawler_tm) " \
      "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"

url = 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml'

headers = {
    'Host': 'www.12365auto.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'User-Agent': ua.random,
}

def get_html(url):
    """Fetch a listing page and return it as a parsed BeautifulSoup document."""
    web_data = requests.get(url=url, headers=headers)
    soup = bs(web_data.text, 'lxml')
    return soup
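
# Quick sanity check (hypothetical usage, not part of the original script):
#   page = get_html('http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-1.shtml')
#   print(len(page.select('div.tslb_b table tr')) - 1)   # data rows, excluding the header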




if __name__ == '__main__':

    flag = 0
    alist = []   # rows buffered across pages, flushed to MySQL every ten pages

    for i in range(1, 10510):
        url = "http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-{}.shtml".format(str(i))
        flag += 1

        soup = get_html(url)
        nodes = soup.select('div.tslb_b table tr')[1:]   # skip the table header row

        for node in nodes:
            # complaint id
            complaint_id = node.select('td')[0].text
            # car brand
            car_brand = node.select('td')[1].text
            # car series
            car_series = node.select('td')[2].text
            # car model
            car_model = node.select('td')[3].text
            # problem description
            description = node.select('td')[4].text
            # typical problem
            topical_problem = node.select('td')[5].text
            # complaint time
            complain_tm = node.select('td')[6].text
            # complaint status
            complain_status = node.select('td')[7].text

            crawler_tm = str(datetime.now())

            alist.append([complaint_id, car_brand, car_series, car_model, description,
                          topical_problem, complain_tm, complain_status, crawler_tm])

        # write the buffered rows to the database every ten pages
        if flag % 10 == 0:
            for row in alist:
                try:
                    cur.execute(sql, row)
                    conn.commit()
                except Exception as e:
                    print(e)

            alist = []
            time.sleep(random.randint(1, 3))
        print(flag, datetime.now(), url)

    # flush any rows still buffered after the last page
    for row in alist:
        try:
            cur.execute(sql, row)
            conn.commit()
        except Exception as e:
            print(e)

    conn.close()
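
The INSERT statement assumes a chezhiwang.complaint table with nine columns already exists. As a rough sketch (the column names come from the SQL string above; the types and lengths are assumptions, not the original schema), the table could be created once before the first run:

import pymysql

ddl = """
create table if not exists chezhiwang.complaint (
    complaint_id  varchar(20),
    car_brand     varchar(50),
    car_series    varchar(100),
    car_model     varchar(100),
    description   text,
    topical_prob  varchar(200),
    cp_tm         varchar(20),
    cp_status     varchar(20),
    crawler_tm    varchar(30)
) default charset = utf8
"""

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='*******', charset='utf8', db='chezhiwang')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()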