selenium+xpath 爬取京东商品信息

2018-07-16 本文已影响0人把握_cc79

这是一个没有翻页处理的爬取示例。稍作修改即可加上翻页处理，做法可参考隔壁淘宝商品信息抓取中的翻页实现。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from lxml import etree


def search(max_retries=3):
    """Open the JD home page, search for '鞋子', and scrape the result page.

    Relies on the module-level ``browser`` and ``wait`` objects created in
    the ``__main__`` block.

    Args:
        max_retries: Maximum number of attempts before giving up. Each
            attempt reloads the home page and repeats the whole search.

    Returns:
        The text of the element at ``//*[@id="J_bottomPage"]/span[2]/em[1]/b``
        (presumably the total number of result pages -- confirm against the
        live page).

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        TimeoutException: If every attempt timed out; the last timeout is
            re-raised.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    last_timeout = None
    # A bounded loop replaces the previous unbounded recursion, which could
    # only end in RecursionError when the page kept timing out.
    for _ in range(max_retries):
        try:
            browser.get('https://www.jd.com/')
            search_box = wait.until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="key"]'))
            )
            search_box.clear()
            # Sending ENTER submits the query, so the search button is not clicked.
            search_box.send_keys('鞋子', Keys.ENTER)
            total = wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b')
                )
            )
            # A timeout raised inside get_products() also triggers a retry.
            get_products()
            return total.text
        except TimeoutException as exc:
            last_timeout = exc
    raise last_timeout

def get_products():
    """Parse the current result page and print one record per product.

    Waits until the element with class ``page clearfix`` is present, parses
    ``browser.page_source`` with lxml, and extracts three parallel lists:
    image ``src`` attributes, the link text under ``p-commit`` elements
    (presumably comment counts), and the ``title`` attribute of ``curr-shop``
    elements (presumably shop names -- confirm against the live page).

    The lists are paired by position. If they differ in length (for example
    when some images have no ``src`` attribute yet), the missing fields are
    filled with ``None`` instead of dropping the record.

    Relies on the module-level ``browser`` and ``wait`` objects.

    Raises:
        TimeoutException: If the pagination element does not appear in time.
    """
    from itertools import zip_longest

    wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="page clearfix"]')))
    tree = etree.HTML(browser.page_source)

    images = tree.xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a/img/@src')
    comments = tree.xpath('//*[@class="p-commit"]//a/text()')
    names = tree.xpath('//*[@class="curr-shop"]/@title')

    # Previously every iteration rebuilt the same dict of whole lists and the
    # print sat outside the loop, so no per-product record was produced (and
    # an empty result raised NameError). Emit one record per product instead.
    for image, comment, name in zip_longest(images, comments, names):
        product = {
            'images': image,
            'comment': comment,
            'name': name,
        }
        print(product)

if __name__ == '__main__':
    # `browser` and `wait` are module-level globals read by search() and
    # get_products(); `wait` gives each explicit wait a 10-second timeout.
    browser = webdriver.Chrome()
    try:
        wait = WebDriverWait(browser, 10)
        search()
    finally:
        # Always shut down Chrome and its driver process, even when the
        # scrape fails, so no orphaned browser is left behind.
        browser.quit()

selenium+xpath 爬取京东商品信息

猜你喜欢

热点阅读