6、Selenium框架 -- 网页信息定位Demo
2020-04-25 本文已影响0人
波罗的海de夏天
Demo脚本:
# -*- coding:utf-8 -*-
import json
import os
import time

from selenium import webdriver
# 响应链
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
# info location
def _switch_to_other_windows(driver, known_handles):
    """Switch *driver* to every open window whose handle is not in *known_handles*.

    If several such windows exist the driver ends up on the last one
    reported by ``driver.window_handles``; if none exist the current
    window is left unchanged.
    """
    for handle in driver.window_handles:
        if handle not in known_handles:
            driver.switch_to.window(handle)


def to_goods_page(driver, web_url):
    """Drive the browser from the home page to a product page and save its specs.

    The steps are: open *web_url*, hover the "电脑" link, click "笔记本",
    follow the newly opened window, apply the brand / price / sort filters,
    open the first product in the list, follow its window, open the
    specification tab, parse every ``Ptable-item`` element and hand the
    result to ``save_goods_info``.

    Elements are located with ``find_element(By..., ...)`` because the
    ``find_element_by_*`` helpers were removed in Selenium 4.3; the
    ``By``-based form works on older releases as well.

    Args:
        driver: A started Selenium WebDriver instance.
        web_url: URL of the start page.

    NOTE(review): the XPath locators are tied to the page layout that was
    current when the script was written -- confirm they still match.
    """
    # Open the start page.
    driver.get(web_url)
    time.sleep(1)

    # Locate the "电脑" (computers) entry and hover over it so that its
    # sub-menu becomes visible.
    computer_element = driver.find_element(By.LINK_TEXT, '电脑')
    time.sleep(1)
    ActionChains(driver).move_to_element(computer_element).perform()
    time.sleep(2)

    # Click "笔记本" (notebooks).
    driver.find_element(By.LINK_TEXT, "笔记本").click()
    time.sleep(1)

    # Follow the window opened by the click: switch to whichever handle is
    # not the window we started in.
    index_handle = driver.current_window_handle
    _switch_to_other_windows(driver, {index_handle})
    time.sleep(2)

    # Brand filter ("thinkpad" according to the original author's note).
    driver.find_element(By.XPATH, '//*[@id="brand-11518"]/a/img').click()
    time.sleep(1)

    # Price filter ("7000 and above" according to the original author's note).
    driver.find_element(
        By.XPATH, '//*[@id="J_selectorPrice"]/div/div[2]/div/ul/li[7]/a'
    ).click()
    time.sleep(1)

    # Sort by number of comments.
    driver.find_element(
        By.XPATH, '//*[@id="J_filter"]/div[1]/div[1]/a[3]'
    ).click()
    time.sleep(1)

    # Open the first product in the result list.
    driver.find_element(
        By.XPATH, '//*[@id="plist"]/ul/li[1]/div/div[1]/a/img'
    ).click()
    time.sleep(1)

    # Follow the product window: it is the handle that is neither the start
    # window nor the listing window we are currently on.
    notebook_handle = driver.current_window_handle
    _switch_to_other_windows(driver, {index_handle, notebook_handle})
    time.sleep(1)

    # Scroll down by 1000 px.
    js = 'window.scrollTo(0, 1000)'
    driver.execute_script(js)
    time.sleep(1)

    # Select the second tab of the detail area (the specification tab,
    # going by the original author's note).
    driver.find_element(By.XPATH, '//*[@id="detail"]/div[1]/ul/li[2]').click()
    time.sleep(1)

    # Each "Ptable-item" element is one section of the specification table;
    # turn every section into a dict.
    info_elements = driver.find_elements(By.CLASS_NAME, 'Ptable-item')
    result_list = [
        get_info_element_dict(info_element) for info_element in info_elements
    ]
    time.sleep(1)

    # Persist the collected data.
    save_goods_info(result_list)
# 解析商品信息,封装成dict数据
def get_info_element_dict(info_element):
    """Convert one specification-table section into a nested dict.

    The section title is read from the ``h3`` child, the keys from the
    ``dt`` children and the values from the ``dd`` descendants of ``dl``
    whose class does not contain ``Ptable-tips``.

    Elements are located with ``find_element(By..., ...)`` because the
    ``find_element_by_*`` helpers were removed in Selenium 4.3.

    Args:
        info_element: The element representing one section of the table
            (presumably a Selenium WebElement with class ``Ptable-item``
            -- confirm against the caller).

    Returns:
        ``{section_title: {key_text: value_text, ...}}``. Keys and values
        are paired positionally; if the page yields a different number of
        each, the surplus is ignored instead of raising ``IndexError``.
    """
    # Column 1: section title.
    computer_part = info_element.find_element(By.TAG_NAME, 'h3')
    # Column 2: attribute names.
    computer_info_keys = info_element.find_elements(By.TAG_NAME, 'dt')
    # Column 3: attribute values, skipping the "Ptable-tips" <dd> entries
    # so that values stay aligned with their keys.
    computer_info_values = info_element.find_elements(
        By.XPATH, 'dl//dd[not(contains(@class, "Ptable-tips"))]'
    )

    key_and_value_dict = {
        key_element.text: value_element.text
        for key_element, value_element in zip(
            computer_info_keys, computer_info_values
        )
    }
    return {computer_part.text: key_and_value_dict}
# 信息保存到文件
def save_goods_info(result_list):
    """Write *result_list* as JSON to the goods file.

    The target is the concatenation of the module-level names
    ``goods_file_path`` and ``goods_file_name``; the file is overwritten
    and encoded as UTF-8, with non-ASCII characters written unescaped.

    Args:
        result_list: JSON-serialisable data to store.
    """
    target = goods_file_path + goods_file_name
    serialized = json.dumps(result_list, ensure_ascii=False)
    with open(target, 'w', encoding='utf-8') as out:
        out.write(serialized)
if __name__ == '__main__':
    # Script entry point: prepare the output location, start Chrome, run the
    # scrape and always shut the browser down afterwards.

    # Absolute path of the current working directory.
    project_path = os.path.abspath(os.path.curdir)
    # Output directory; the trailing slash matters because save_goods_info
    # concatenates directory and file name directly.
    goods_file_path = project_path + '/goods_info/'
    print('----- goods file path:', goods_file_path)
    # Create the directory if needed, without a separate exists() check
    # that could race with another process.
    os.makedirs(goods_file_path, exist_ok=True)
    # Output file name (read as a module-level name by save_goods_info).
    goods_file_name = 'computer-2.infos'
    time.sleep(1)

    # Start the browser with the chromedriver binary at this path.
    driver = webdriver.Chrome('/Users/****/Desktop/work_note/ai_git/lesson_selenium/chromedriver')
    try:
        driver.maximize_window()
        # Start page.
        web_url = 'https://www.jd.com/'
        # Navigate, scrape and save.
        to_goods_page(driver, web_url)
        # Leave the final page visible for a moment before closing.
        time.sleep(5)
    finally:
        # Quit even when the scrape raised, so that no browser or
        # chromedriver process is left running.
        driver.quit()