工作生活

利用selenium爬取QQ空间留言(上)

2019-07-04  本文已影响0人  八盖
这部分是先登录并找到留言板页面,取得留言的总页数,方便后期按页循环爬取
import re
import time

import pymongo as pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome import options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq


# Location of the chromedriver binary used to drive Chrome.
CHROMEDRIVER_PATH = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe"

# Chrome is started with the --headless flag.
option = webdriver.ChromeOptions()
option.add_argument('--headless')

browser = webdriver.Chrome(
    executable_path=CHROMEDRIVER_PATH,
    options=option,
)
# Shared explicit wait: every wait.until(...) call uses a 100-second timeout.
wait = WebDriverWait(browser, 100)

def _load_total_pages():
    """Make one attempt to log in, open the message board and read the page count.

    Uses the module-level ``browser`` and ``wait`` objects.

    Returns:
        The text of the pager element selected below (the original author
        treats it as the total number of message-board pages).

    Raises:
        TimeoutException: if any awaited element does not appear in time.
    """
    browser.get('https://i.qq.com/')
    # The login widgets live inside the #login_frame iframe, so the driver
    # has to switch into it before any of them can be located.
    wait.until(
        EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, '#login_frame'))
    )
    # Switch from quick login to the account/password form.
    login = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#switcher_plogin'))
    )
    login.click()
    # Fill in the account and password.
    username = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#u'))
    )
    password = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#p'))
    )
    username.clear()
    username.send_keys('**********')
    password.send_keys('***********')
    # Submit the login form.
    submit = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#login_button'))
    )
    submit.click()
    # Leave the login iframe: the selectors below target the main page, not
    # the login frame the driver is still focused on.
    browser.switch_to.default_content()
    # Open the message board via its menu entry.
    message = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#menuContainer > div > ul > li.menu_item_334.cur > a'))
    )
    message.click()
    # Locate the pager element that holds the page count.
    total = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#pager_bottom > div > p.mod_pagenav_main > span.mod_pagenav_count > a:nth-child(8)'))
    )
    return total.text


def get_message_page(max_retries=None):
    """Log in, open the message board and return the page-count text.

    The whole sequence is restarted from the first page load whenever a wait
    times out. Retrying is done in a loop rather than by recursion, so
    repeated timeouts cannot exhaust the call stack.

    Args:
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Returns:
        The text of the pager element read by ``_load_total_pages``.

    Raises:
        TimeoutException: only when ``max_retries`` is set and every
            attempt timed out.
    """
    retries = 0
    while True:
        try:
            return _load_total_pages()
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


if __name__ == '__main__':
    try:
        get_message_page()
    finally:
        # Always shut the browser down so the headless Chrome and
        # chromedriver processes do not outlive the script.
        browser.quit()

需要注意的有几点:

1.登录表单位于id为login_frame的iframe里面,所以需要先利用expected_conditions里面的frame_to_be_available_and_switch_to_it方法切换到这个iframe,再在其中查找“账号密码登录”按钮和输入框,不然会一直停留在外层页面(快速登录的那一层),是找不到输入框的。

2.添加try-except,当等待超时(抛出TimeoutException)时重新调用函数,从头再访问一次。

上一篇 下一篇

猜你喜欢

热点阅读