Python Web Scraping in Practice, Part 9: Advanced Selenium Operations and Scraping JD.com Product Reviews
2018-08-13
Table of Contents: the Python Web Scraping in Practice series
- Part 1: Theoretical Foundations of Web Crawlers
- Part 2: Environment Setup, Basic Syntax, File Operations
- Part 3: The Basic Tool Libraries urllib and requests
- Part 4: BeautifulSoup
- Part 5: Regular Expressions
- Part 6: Static Page Scraping in Practice
- Part 7: Dynamic Page Scraping in Practice with Selenium + PhantomJS
- Part 8: Dynamic Page Scraping in Practice with Selenium + Headless Chrome
- Part 9: Advanced Selenium Operations and Scraping JD.com Product Reviews
- Part 10: Data Collection via APIs
- Part 11: An Introduction to the Scrapy Crawler Framework
- Part 12: Three Small Scrapy Crawler Case Studies
- Part 13: Scraping the Detective Conan Comics with Scrapy
- Part 14: Scraping Dynamic Pages with Scrapy plus scrapy-splash
Main content:
I. Advanced Selenium Operations
1. Recap: Using Selenium to Open a Browser with a Visible Window
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    input = browser.find_element_by_id('kw')
    input.send_keys('Python')
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()
2. Recap: Using Selenium to Open a Headless Browser
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # enable headless mode
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
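A headless browser has no real window, so layout-dependent locators can behave differently than in a visible browser. A minimal sketch of giving the headless browser an explicit viewport size (the 1920x1080 value is an arbitrary choice, not from the original):
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')  # explicit viewport size for headless rendering
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.title)
driver.close()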
3. Page Interaction: Simulating a Manual Search on Taobao
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get("http://www.taobao.com")
input_str = browser.find_element_by_id('q')
input_str.send_keys("ipad")
time.sleep(1)
input_str.clear()
input_str.send_keys("macBook pro")
button = browser.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]')
button.click()
4. Action Chains: Simulating Dragging an Image Element
from selenium import webdriver
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
browser.get(url)
browser.switch_to.frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)
actions.perform()
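The same drag can also be composed from lower-level actions, which can help when drag_and_drop alone does not trigger the page's event handlers. A sketch reusing the source and target elements from above:
actions = ActionChains(browser)
actions.click_and_hold(source)   # press and hold on the draggable element
actions.move_to_element(target)  # move the pointer over the drop zone
actions.release()                # let go to complete the drop
actions.perform()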
5. Executing JavaScript: Scrolling a Zhihu Page to the Bottom
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("http://www.zhihu.com/explore")
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
browser.execute_script('alert("To Bottom")')
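A single scrollTo only reaches the current bottom; on infinite-scroll pages new content keeps extending the page. A standalone sketch (run it instead of the alert call above, since an open alert blocks further script execution) that scrolls repeatedly until the page height stops growing; the 1-second pause is an arbitrary choice:
import time
last_height = browser.execute_script('return document.body.scrollHeight')
while True:
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)  # give lazily loaded content time to render
    new_height = browser.execute_script('return document.body.scrollHeight')
    if new_height == last_height:  # height unchanged: no more content loaded
        break
    last_height = new_height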
6. Getting Node Information: the "Ask a Question" Button on the Zhihu Homepage
from selenium import webdriver
browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
input = browser.find_element_by_class_name('zu-top-add-question')
print(input.id)
print(input.location)
print(input.tag_name)
print(input.size)
7. Implicit Waits
from selenium import webdriver
browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.zhihu.com/explore')
input = browser.find_element_by_class_name('zu-top-add-question')
print(input)
8. Explicit Waits
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome()
browser.get('https://www.taobao.com/')
wait = WebDriverWait(browser, 10)
input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
print(input, button)
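Besides the built-in expected_conditions, until() accepts any callable that takes the driver and returns a truthy value. A one-line sketch of a custom condition:
wait.until(lambda d: 'taobao' in d.current_url)  # wait until the URL contains 'taobao'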
9. Switching Frames
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
browser = webdriver.Chrome()
url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
browser.get(url)
browser.switch_to.frame('iframeResult')
try:
    logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print('NO LOGO in iframeResult')
browser.switch_to.parent_frame()
try:
    logo = browser.find_element_by_class_name('logo')
    print(logo)
    print(logo.text)
except NoSuchElementException:
    print('NO LOGO in parent_frame')
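parent_frame() moves up one level at a time; switch_to.default_content() returns straight to the top-level document, which is handy with nested iframes:
browser.switch_to.default_content()  # back to the top-level page, however deep the nesting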
10. Back and Forward
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('https://www.baidu.com/')
browser.get('https://www.taobao.com/')
browser.get('https://www.zhihu.com/')
browser.back()
time.sleep(1)
browser.forward()
browser.close()
11. Working with Browser Tabs
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
print(browser.window_handles)
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.zhihu.com/')
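Note that close() closes only the current tab, so switch to the tab you want to close first. A sketch continuing the script above:
time.sleep(1)
browser.switch_to.window(browser.window_handles[1])
browser.close()  # closes only the current (Taobao) tab
browser.switch_to.window(browser.window_handles[0])  # back to the remaining tab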
II. Scraping JD.com Product Reviews
1. Scraping JD.com product reviews with a visible browser
from selenium import webdriver
from urllib.parse import quote
driver = webdriver.Chrome()  # open the browser
key = '红酒'  # the product search keyword ("red wine")
url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
driver.get(url)  # open the URL
driver.implicitly_wait(3)  # wait for the page to load
links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/a')  # collect the product links on the current page
urls = [l.get_attribute('href') for l in links]
url = urls[1]  # pick one product link (note: index 1 is the second match, not the first)
driver.get(url)  # open the product page
driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the product reviews tab
# extract the review texts
comment_list = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
comment_text_list = [c.text for c in comment_list]
driver.find_element_by_link_text('下一页').click()  # click "下一页" (next page) to load more reviews
driver.close()
2. Scraping JD.com product reviews with a headless browser
from selenium import webdriver
from urllib.parse import quote
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(chrome_options=chrome_options)
key = '红酒'  # the product search keyword ("red wine")
url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
driver.get(url)  # open the URL
driver.implicitly_wait(3)  # wait for the page to load
links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/a')  # collect the product links on the current page
urls = [l.get_attribute('href') for l in links]
url = urls[1]  # pick one product link (index 1, the second match)
driver.get(url)  # open the product page
driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the product reviews tab
# extract the review texts
comment_list = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
comment_text_list = [c.text for c in comment_list]
# driver.find_element_by_link_text('下一页').click()  # TODO fails in headless mode: Message: no such element: Unable to locate element: {"method":"link text","selector":"下一页"}
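The failure above is most likely a rendering/timing difference in headless mode. A hedged workaround (a sketch, not verified against JD's current markup) is to wait explicitly for the '下一页' (next page) link and scroll it into view before clicking:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
wait = WebDriverWait(driver, 10)
next_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, '下一页')))  # wait for the link to appear in the DOM
driver.execute_script('arguments[0].scrollIntoView();', next_link)  # bring it into the viewport
next_link.click()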
3. Scraping JD.com product reviews with the logic wrapped in functions
from selenium import webdriver
from urllib.parse import quote
import pandas as pd
from selenium.common.exceptions import StaleElementReferenceException
def get_page_comment(driver):
    try:
        content = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
        content_list = [c.text for c in content]
    except StaleElementReferenceException as msg:
        print("get_page_comment exception: %s" % msg)
        print("retrying get_page_comment")
        content = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
        content_list = [c.text for c in content]
    return content_list

def get_page_all_comment(driver, i):
    all_content = get_page_comment(driver)
    while True:
        try:
            driver.find_element_by_link_text('下一页').click()
            all_content = all_content + get_page_comment(driver)
        except:
            print("No more pages - " + str(i))  # TODO clicking "下一页" failed; needs refinement
            break
    return all_content

def get_all_comment(urls, driver, outpath='D:/DataguruPyhton/PythonSpider/images/'):
    i = 0
    for url in urls:
        i += 1
        driver.get(url)
        driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the product reviews tab
        name = driver.find_element_by_xpath('/html/body/div[8]/div/div[2]/div[1]').text
        print("File %d - %s" % (i, name))
        comment = get_page_all_comment(driver, i)
        comment = pd.DataFrame(comment)
        comment.to_csv(outpath + str(i) + '.csv')
    return None

def get_links(key, driver):
    url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
    driver.get(url)  # open the URL
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')  # scroll to the bottom of the page
    driver.implicitly_wait(3)  # wait for the page to load
    links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a')  # collect the product links on the current page
    urls = [l.get_attribute('href') for l in links]
    return urls

def main(key):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # enable headless mode
    driver = webdriver.Chrome(chrome_options=chrome_options)
    urls = get_links(key, driver)
    get_all_comment(urls, driver, outpath='D:/DataguruPyhton/PythonSpider/images/')

main('红酒')
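The bare except in get_page_all_comment swallows every error, which is what the TODO hints at. A hedged refinement (the _v2 name is mine) that waits explicitly for the '下一页' link and treats a timeout as "no more pages":
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def get_page_all_comment_v2(driver, i):
    all_content = get_page_comment(driver)
    wait = WebDriverWait(driver, 5)
    while True:
        try:
            next_link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '下一页')))
            next_link.click()
            all_content += get_page_comment(driver)
        except TimeoutException:  # no clickable "下一页" within 5 s: assume this was the last page
            print("No more pages - " + str(i))
            break
    return all_content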
III. Runtime Environment for the Code in This Article
- python 3.6.4
- selenium 3.8.0
- Google Chrome 68.0.3440.106 (official build, 64-bit)
- chromedriver.exe