利用Selenium爬取京东某店铺商品信息
2019-12-03 本文已影响0人
小T数据站
由于近期有一个每日记录京东史密斯旗舰店所有商品信息的需求,我就琢磨着写了一段脚本,半自动地获取相关信息。
为什么是半自动呢?因为史密斯的商品总共有6页,我将这些页面的URL直接复制粘贴进了脚本里,没有想办法去自动获取。
不过就如邓小平爷爷所说:“不管黑猫白猫,能捉老鼠的就是好猫”,以下是实现此次需求的代码:
#!/usr/bin/python
# -*- coding:utf-8 -*-
from selenium import webdriver
from lxml import etree
import time,re,csv
# Open the output CSV in append mode so rows written by earlier runs are kept.
# The 'utf_8_sig' codec prefixes a newly created file with a UTF-8 BOM.
fp = open('/Users/mason_tao/Desktop/sms.csv', 'a', newline='', encoding='utf_8_sig')
writer = csv.writer(fp)
# Write the column names only when the file is still empty. In append mode an
# unconditional header would be repeated in the middle of the data on every run.
if fp.tell() == 0:
    writer.writerow(("商品标题", "商品链接", "促销价", "原价"))
def get_goods_DetailUrl(url):
    """Collect the product detail-page URLs linked from one listing page.

    The page is loaded in headless Chrome and the rendered HTML is parsed
    with lxml.

    Args:
        url: URL of a shop listing page.

    Returns:
        A list of detail-page URLs in the order they appear on the page.
        Listing items without a description link are skipped.
    """
    # Run Chrome in headless mode.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    try:
        browser.get(url)
        # Fixed 1-second pause to give the page time to finish loading.
        time.sleep(1)
        source = browser.page_source
    finally:
        # Always shut the browser down, even if loading fails, so headless
        # Chrome processes do not pile up across calls.
        browser.quit()

    html = etree.HTML(source)
    goods_urls = []
    for li in html.xpath("//div[@class='j-module']/ul/li"):
        hrefs = li.xpath(".//div[@class='jDesc']/a/@href")
        if not hrefs:
            # No description link under this item; nothing to follow.
            continue
        href = hrefs[0]
        # The 'https:' prefix suggests hrefs are normally scheme-less
        # ('//host/path'); leave a link alone if it already has a scheme.
        if not href.startswith(('http://', 'https://')):
            href = 'https:' + href
        goods_urls.append(href)
    return goods_urls
def get_goods_Info(goods_urls):
    """Scrape title and prices from each product page and append a CSV row.

    Each page is loaded in headless Chrome, parsed with lxml, and written to
    the module-level ``writer`` as ``[title, url, deal price, listed price]``.
    Fields that cannot be found on the page are written as empty strings.

    Args:
        goods_urls: Product detail-page URLs to visit.

    Returns:
        None. Results are written through ``writer``.
    """
    goods_urls = list(goods_urls)
    if not goods_urls:
        # Nothing to scrape; do not start a browser for an empty batch.
        return

    # Compiled once for the whole batch. Captures the number in front of "元"
    # that follows "到手" in the promotion text. Only numbers with at least
    # four digits are accepted; "{4,}" (rather than "{4}") keeps a five-digit
    # price such as 12999 from being cut down to its last four digits.
    pattern = re.compile(".*?到手.*?([0-9]{4,})元.*?")

    # One headless browser is reused for every product in the batch.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    try:
        for goods_url in goods_urls:
            print(goods_url)
            browser.get(goods_url)
            # Fixed 1-second pause to give the page time to finish loading.
            time.sleep(1)
            goods_html = etree.HTML(browser.page_source)

            # The last text node of the sku-name element is used as the title.
            title_parts = goods_html.xpath("//div[@class='sku-name']/text()")
            title = title_parts[-1].strip() if title_parts else ''

            # Listed price; empty string when the page shows none.
            prices = goods_html.xpath("//span[@class='p-price']/span[2]/text()")
            price = prices[0] if prices else ''

            # Promotion text that may contain a "到手" price.
            hidden = goods_html.xpath("//div[@class='item hide']/text()")
            hide_text = hidden[0] if hidden else ''

            # Fall back to the listed price when no deal price is found.
            match = pattern.search(hide_text)
            text_price = match.group(1) if match else price

            writer.writerow([title, goods_url, text_price, price])
            # Pause between products.
            time.sleep(1)
    finally:
        # Shut the browser down even if a page fails to load or parse.
        browser.quit()
def main():
    """Scrape every hard-coded listing page and record its products.

    For each listing URL, the detail-page URLs are collected and then each
    product is scraped into the CSV. The module-level CSV file ``fp`` is
    closed when the run ends, whether it succeeded or failed.
    """
    # Listing-page URLs are maintained by hand.
    urls = ['https://mall.jd.com/view_search-396630-0-99-1-24-1.html',
            'https://aosmith1.jd.com/view_search-396630-0-5-1-24-2.html']
    try:
        for url in urls:
            detail_urls = get_goods_DetailUrl(url)
            get_goods_Info(detail_urls)
    finally:
        # Close the CSV so buffered rows reach disk even if a page fails.
        fp.close()


if __name__ == '__main__':
    main()
下图为爬取的部分结果:
史密斯商品