python Selenium 借助浏览器抓包
2018-03-01 本文已影响1150人
proud2008
安装
pip install selenium
from selenium import webdriver

# Launch Chrome through Selenium, then load the page in that browser.
start_url = 'https://qiang.taobao.com/'
browser = webdriver.Chrome()
browser.get(start_url)
# Import the WebDriver entry point and the locator strategies from selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the browser object. This drives Chrome (not PhantomJS), so the
# chromedriver executable has to be reachable, e.g. on PATH.
browser = webdriver.Chrome()
try:
    # URL to visit.
    url = 'https://www.baidu.com'
    # Load the page.
    browser.get(url)
    # Implicit wait: element lookups below keep retrying for up to 3 seconds,
    # which gives the page's JavaScript time to render the elements.
    browser.implicitly_wait(3)
    # Locate the search box.
    text = browser.find_element(By.ID, 'kw')
    # Clear any text already in the search box.
    text.clear()
    # Type the search keyword.
    text.send_keys('python')
    # Locate the submit button.
    button = browser.find_element(By.ID, 'su')
    # Submit the search request.
    button.submit()
    # Show the current page title.
    print(browser.title)
    # Save a screenshot so the rendered page can be inspected.
    browser.save_screenshot('text.png')
    # Collect the result entries as a list.
    results = browser.find_elements(By.CLASS_NAME, 't')
    # Print the title and the link of every result.
    for result in results:
        print('标题:{} 超链接:{}'.format(
            result.text,
            result.find_element(By.TAG_NAME, 'a').get_attribute('href')))
finally:
    # Always shut the browser down so no chromedriver process is left behind,
    # even when one of the lookups above raises.
    browser.quit()
问题
1、Error message: “'chromedriver' executable needs to be available in the path”
从 https://sites.google.com/a/chromium.org/chromedriver/downloads 下载 chromedriver
将 chromedriver.exe 放到 Python 脚本所在的文件夹下面,
或者在 webdriver.Chrome() 的参数中指定 chromedriver 的完整路径
文档
http://selenium-python-zh.readthedocs.io/en/latest/page-objects.html
与scrapy配合使用
https://github.com/clemfromspace/scrapy-selenium
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
from importlib import import_module
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
"""This module contains the ``SeleniumRequest`` class"""
from scrapy import Request
class SeleniumRequest(Request):
    """Scrapy ``Request`` subclass that carries extra selenium options"""

    def __init__(self, url, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
        """Create a request that is meant to be rendered through selenium

        Parameters
        ----------
        url
            The URL to request; forwarded to ``Request.__init__``.
        wait_time: int
            The number of seconds to wait.
        wait_until: method
            One of the "selenium.webdriver.support.expected_conditions". The
            response is only returned once the given condition is fulfilled.
        screenshot: bool
            If True, a screenshot of the page will be taken and its data will be
            made available through the "meta" attribute.
        *args, **kwargs
            Any remaining arguments, forwarded unchanged to ``Request.__init__``.

        """
        super().__init__(url, *args, **kwargs)

        # Plain attributes; stored as given, without validation.
        self.screenshot = screenshot
        self.wait_until = wait_until
        self.wait_time = wait_time
class SeleniumMiddleware:
    """Scrapy middleware handling ``SeleniumRequest`` requests using selenium"""

    def __init__(self):
        """Start the Chrome driver that is reused for every handled request"""
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook driver shutdown to ``spider_closed``"""
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        """Process a request using the selenium driver if applicable

        Parameters
        ----------
        request
            The request being downloaded. Only ``SeleniumRequest`` instances
            are handled here.
        spider
            The spider that issued the request (unused).

        Returns
        -------
        None
            When ``request`` is not a ``SeleniumRequest``, so that Scrapy keeps
            processing it with the remaining middlewares and the downloader.
        HtmlResponse
            The page source rendered by the driver, for a ``SeleniumRequest``.

        """
        if not isinstance(request, SeleniumRequest):
            # Returning the request object itself would tell Scrapy to
            # reschedule it instead of downloading it; None means "not mine,
            # carry on".
            return None

        self.driver.get(request.url)

        # Cookies are set after the page is loaded, on the current document.
        for cookie_name, cookie_value in request.cookies.items():
            self.driver.add_cookie(
                {
                    'name': cookie_name,
                    'value': cookie_value
                }
            )

        if request.wait_until:
            # NOTE(review): ``wait_time`` defaults to None on SeleniumRequest;
            # callers setting ``wait_until`` presumably must also pass a
            # numeric ``wait_time`` -- confirm.
            WebDriverWait(self.driver, request.wait_time).until(
                request.wait_until
            )

        if request.screenshot:
            request.meta['screenshot'] = self.driver.get_screenshot_as_png()

        # Encode explicitly with the same charset declared on the response.
        body = self.driver.page_source.encode('utf-8')

        # Expose the driver via the "meta" attribute
        request.meta.update({'driver': self.driver})

        return HtmlResponse(
            self.driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def spider_closed(self):
        """Shutdown the driver when spider is closed"""
        self.driver.quit()
在 spider 中通过 return SeleniumRequest(url) 返回请求
在 settings.py 的 DOWNLOADER_MIDDLEWARES 中添加该 SeleniumMiddleware