淘宝运用selenium+无头浏览器爬取并存入数据库
2020-05-10 本文已影响0人
公元2094年
代码:
import re
import pymysql
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Module-level browser session shared by every function below.
option = Options()
# Headless mode is left disabled here; the script attaches to an existing
# browser through the debugger address instead of launching its own.
# option.add_argument("--headless")
# Attach to a Chrome instance listening for DevTools connections on port 9222
# (presumably started manually with --remote-debugging-port=9222 -- confirm).
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
# `chrome_options=` is deprecated and removed in Selenium 4; `options=` is the
# supported keyword and is also accepted by later Selenium 3 releases.
driver = webdriver.Chrome(options=option)
# Explicit waits below give up after 10 seconds with TimeoutException.
wait = WebDriverWait(driver, 10)
def search(keyword='美食', max_retries=None):
    """Run a search on the page currently loaded in ``driver``.

    Types ``keyword`` into the ``#q`` search box, submits the form, waits for
    the pager's "total" element, and scrapes the first result page via
    ``get_product()``.

    Args:
        keyword: Text typed into the search box. Defaults to the original
            hard-coded query.
        max_retries: Maximum number of retries after a ``TimeoutException``.
            ``None`` (the default) retries indefinitely, as the original did.

    Returns:
        The text of the pager's "total" element (parsed by ``main`` to get
        the page count).

    Raises:
        TimeoutException: Only when ``max_retries`` is set and exhausted.
    """
    attempts = 0
    # Retry in a loop rather than recursively so that a long run of timeouts
    # cannot hit the interpreter's recursion limit.
    while True:
        try:
            # Named `search_box` to avoid shadowing the builtin `input`.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_SearchForm > button")
                )
            )
            search_box.clear()
            search_box.send_keys(keyword)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.total")
                )
            )
            # A timeout while waiting for the item list also restarts the
            # whole search, matching the original control flow.
            get_product()
            return total.text
        except TimeoutException:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise
def next_page(page_n, max_retries=None):
    """Jump to result page ``page_n`` via the pager form and scrape it.

    Fills the pager's page-number input, clicks its submit button, waits
    until the active pager item shows ``page_n``, then calls
    ``get_product()``.

    Args:
        page_n: Page number to navigate to.
        max_retries: Maximum number of retries after a ``TimeoutException``.
            ``None`` (the default) retries indefinitely, as the original did.

    Raises:
        TimeoutException: Only when ``max_retries`` is set and exhausted.
    """
    attempts = 0
    # Iterative retry: the original recursed on every timeout, which would
    # eventually raise RecursionError on a page that never loads.
    while True:
        # Printed on every attempt, as the recursive original did.
        print("翻页")
        try:
            # Named `page_box` to avoid shadowing the builtin `input`.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")
                )
            )
            page_box.clear()
            page_box.send_keys(page_n)
            submit.click()
            # The highlighted pager item showing the requested number is the
            # signal that the new page has actually been rendered.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR,
                     "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page_n),
                )
            )
            get_product()
            return
        except TimeoutException:
            attempts += 1
            if max_retries is not None and attempts > max_retries:
                raise
def main():
    """Scrape every result page for the default search.

    Runs ``search()`` (which scrapes page 1), extracts the first integer from
    the returned pager text as the page count, then visits pages 2..count
    with ``next_page()``.

    Raises:
        ValueError: If the pager text contains no digits.
    """
    total_text = search()
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    match = re.search(r'(\d+)', total_text)
    if match is None:
        raise ValueError(
            "could not find a page count in pager text: %r" % (total_text,)
        )
    total = int(match.group(1))
    # Page 1 was already scraped inside search().
    for page in range(2, total + 1):
        next_page(page)
def get_product():
    """Parse the current result page and insert each item into MySQL.

    Waits for at least one item element to be present, parses
    ``driver.page_source`` with PyQuery, and inserts one row per item into
    table ``sj`` (image, price, deal, title, shop, location). Each row is
    committed individually; a failed insert is rolled back and reported, and
    processing continues with the next item.

    Raises:
        TimeoutException: If no item element appears within the wait timeout.
        pymysql.MySQLError: If the database connection cannot be established.
    """
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")
        )
    )
    doc = pq(driver.page_source)
    items = doc("#mainsrp-itemlist .items .item").items()
    print("开始爬取")

    sql = """
        INSERT INTO sj(image,price,deal,title,shop,location)
        VALUES (%s,%s,%s,%s,%s,%s)
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration or environment variables.
    # One connection per page: the original opened a new connection for every
    # item and only closed it when the insert failed, leaking the rest.
    conn = pymysql.connect(
        host='qxx2094.cn',
        user='root',
        password='qxxmysql',
        database="test",
        port=3306,
    )
    try:
        with conn.cursor() as cursor:
            for item in items:
                row = (
                    item.find('.pic .img').attr("src"),
                    item.find('.price').text(),
                    item.find('.deal-cnt').text(),
                    item.find('.title').text(),
                    item.find('.shopname').text(),
                    item.find('.location').text(),
                )
                try:
                    # Parameterized query; the driver escapes the values.
                    cursor.execute(sql, row)
                    conn.commit()
                    print("插入成功")
                except pymysql.MySQLError:
                    # Catch database errors only, instead of a bare except
                    # that would also swallow KeyboardInterrupt/SystemExit.
                    conn.rollback()
                    print("插入失败")
    finally:
        conn.close()
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()