我爱编程

python Selenium 借助浏览器抓包

2018-03-01  本文已影响1150人  proud2008

安装
pip install selenium

# Minimal example: start Chrome through Selenium and open one page.
from selenium import webdriver
# Launch a Chrome browser session controlled by this script.
browser = webdriver.Chrome()
# Navigate the browser to the given URL.
browser.get('https://qiang.taobao.com/')

# Import the webdriver entry point and the ``By`` locator constants.
# ``find_element(By.X, ...)`` is used instead of the ``find_element_by_x``
# helpers because the latter were removed in newer Selenium releases.
from selenium import webdriver
from selenium.webdriver.common.by import By


# Create the browser object (a Chrome session, not PhantomJS)
browser = webdriver.Chrome()

try:
    # URL to visit
    url = 'https://www.baidu.com'

    # Load the page
    browser.get(url)

    # Implicit wait: element lookups below keep retrying for up to 3 seconds
    # before failing, which gives the page's JavaScript time to render.
    browser.implicitly_wait(3)

    # Locate the search box
    text = browser.find_element(By.ID, 'kw')

    # Clear any text already in the search box
    text.clear()

    # Type the search keyword
    text.send_keys('python')

    # Locate the submit button
    button = browser.find_element(By.ID, 'su')

    # Submit the form the button belongs to, which sends the search request
    button.submit()

    # Print the current page title
    # NOTE(review): this runs right after the submit, so the title may still be
    # the previous page's if navigation has not finished yet -- confirm.
    print(browser.title)

    # Save a screenshot of the page for visual inspection
    browser.save_screenshot('text.png')

    # Collect the result entries as a list
    results = browser.find_elements(By.CLASS_NAME, 't')

    # Print the title text and link target of every result
    for result in results:
        link = result.find_element(By.TAG_NAME, 'a')
        print('标题:{} 超链接:{}'.format(result.text, link.get_attribute('href')))
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left behind
    browser.quit()

问题
1、Error message: “'chromedriver' executable needs to be available in the path”
从 https://sites.google.com/a/chromium.org/chromedriver/downloads 下载 chromedriver,
将 chromedriver.exe 放到 Python 脚本所在的文件夹下面,
或者在 webdriver.Chrome() 的参数中指定 chromedriver 的完整路径

文档

http://selenium-python-zh.readthedocs.io/en/latest/page-objects.html

与scrapy配合使用
https://github.com/clemfromspace/scrapy-selenium

"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""

from importlib import import_module
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
"""This module contains the ``SeleniumRequest`` class"""

from scrapy import Request


class SeleniumRequest(Request):
    """A scrapy ``Request`` carrying extra options for selenium rendering."""

    def __init__(self, url, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
        """Create a request that should be fetched through the selenium driver.

        The selenium-specific parameters sit between ``url`` and ``*args``, so
        any further ``Request`` arguments (``callback``, ``meta``, ...) are
        best passed by keyword.

        Parameters
        ----------
        url: str
            The address to load; forwarded to ``Request``.
        wait_time: int
            Maximum number of seconds to wait for ``wait_until``.
        wait_until: method
            One of the "selenium.webdriver.support.expected_conditions". The
            response is not returned until this condition is fulfilled.
        screenshot: bool
            If True, a screenshot of the page is taken and stored under the
            "screenshot" key of the "meta" attribute.
        *args, **kwargs
            Passed through unchanged to ``Request.__init__``.

        """

        # Record the selenium options on the instance, then let the base
        # class handle the url and everything else.
        self.screenshot = screenshot
        self.wait_until = wait_until
        self.wait_time = wait_time

        super(SeleniumRequest, self).__init__(url, *args, **kwargs)




class SeleniumMiddleware:
    """Scrapy downloader middleware that fetches ``SeleniumRequest`` pages with selenium.

    One Chrome driver is created per middleware instance and reused for every
    request; it is shut down when the spider closes.
    """

    # Timeout in seconds applied to ``wait_until`` when the request leaves
    # ``wait_time`` at its default of None (WebDriverWait cannot take None).
    default_wait_time = 10

    def __init__(self):
        """Start the Chrome driver used for all selenium requests."""
        self.driver = webdriver.Chrome()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and hook its cleanup into the crawler.

        Parameters
        ----------
        crawler:
            The scrapy crawler; only its signal manager is used, to call
            ``spider_closed`` when the ``spider_closed`` signal fires.

        Returns
        -------
        The new middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        """Fetch ``SeleniumRequest`` objects with the selenium driver.

        Parameters
        ----------
        request:
            The request being downloaded.
        spider:
            The spider that issued the request (unused).

        Returns
        -------
        None for requests that are not ``SeleniumRequest`` instances, so that
        scrapy continues with its regular download handling; otherwise an
        ``HtmlResponse`` built from the driver's current URL and page source.
        """

        if not isinstance(request, SeleniumRequest):
            # Must be None, not the request itself: returning a Request from
            # process_request makes scrapy reschedule it instead of
            # downloading it.
            return None

        self.driver.get(request.url)

        # scrapy accepts cookies either as a {name: value} dict or as a list
        # of cookie dicts; normalise the dict form and forward list entries
        # to the driver unchanged.
        cookies = request.cookies
        if isinstance(cookies, dict):
            cookies = [
                {'name': cookie_name, 'value': cookie_value}
                for cookie_name, cookie_value in cookies.items()
            ]
        for cookie in cookies:
            self.driver.add_cookie(cookie)

        if request.wait_until:
            # Fall back to the class default so a request that sets only
            # ``wait_until`` does not crash WebDriverWait with a None timeout.
            timeout = request.wait_time
            if timeout is None:
                timeout = self.default_wait_time
            WebDriverWait(self.driver, timeout).until(request.wait_until)

        if request.screenshot:
            request.meta['screenshot'] = self.driver.get_screenshot_as_png()

        body = self.driver.page_source.encode('utf-8')

        # Expose the driver via the "meta" attribute
        request.meta['driver'] = self.driver

        return HtmlResponse(
            self.driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def spider_closed(self):
        """Shutdown the driver when spider is closed"""

        self.driver.quit()

使用方法:在爬虫(spider)中通过 return SeleniumRequest(url) 返回请求;
并在 settings.py 的 DOWNLOADER_MIDDLEWARES 中添加该 SeleniumMiddleware。

上一篇 下一篇

猜你喜欢

热点阅读