动态爬虫之QQ空间登录
准备:
1、intellij idea
2、python
3、selenium
4、phantomJs
1、分析Qzone Html页面
打开手机版qzone https://mobile.qzone.qq.com
qzone_openhtml.jpg<color style="color:red">按照上面流程复制账号、密码和登录按钮的XPath粘贴到记事本中</color>
2、超链
1、构建浏览器并且设置请求头
2、开始请求
3、模仿用户输入
4、输入验证码
5、自动登录
6、完整代码
<a id='1'></a>
3、编写爬虫代码
首先创建一个浏览器对象和设置请求头
# 导入驱动包
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
class qzone_dlewares(object):
    """Drive a PhantomJS browser session against the mobile Qzone site."""

    # Request headers attached to every page load; the User-Agent is an
    # Android WeChat (MicroMessenger) browser string.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/4.5.255',
        'Connection': 'keep-alive',
    }

    def __init__(self, userName='', password='', *args, **kwargs):
        """Store the credentials and start PhantomJS with the custom headers.

        Args:
            userName: account name kept on the instance for the login step.
            password: password kept on the instance for the login step.
            *args, **kwargs: accepted but not used.
        """
        self.userName = userName
        self.password = password

        # Each header becomes one "phantomjs.page.customHeaders.<name>" capability.
        capabilities = DesiredCapabilities.PHANTOMJS.copy()
        capabilities.update(
            ('phantomjs.page.customHeaders.' + name, value)
            for name, value in self.headers.items()
        )
        self.driver = webdriver.PhantomJS(desired_capabilities=capabilities)

        # Browser window size (width, height).
        self.driver.set_window_size(414, 736)
<a id='2' ></a>
开始请求 截图
def startQzoneRequest(self):
    """Open the mobile Qzone home page and take a screenshot of it.

    The screenshot is written to the relative path ``screenshot.png``.
    """
    browser = self.driver
    browser.get('https://mobile.qzone.qq.com')
    browser.save_screenshot('screenshot.png')
截图成功会在 project 下面生成screenshot.png
qzone_open_mobile.png
<a id='3' ></a>
模仿用户输入 登录关键性代码
import time
from selenium.webdriver import ActionChains
def loginQzone(self):
    """Type the stored credentials into the login form and submit it.

    Looks up the account box (id ``u``), the password box (id ``p``) and the
    login button (id ``go``), replays the interaction through one action
    chain, waits one second and saves ``screenshotLoginQzone.png``.
    """
    find = self.driver.find_element_by_xpath
    account_box = find('//*[@id="u"]')
    password_box = find('//*[@id="p"]')
    login_button = find('//*[@id="go"]')

    # Every call below only queues a step; perform() replays them in order.
    steps = ActionChains(self.driver)
    steps.move_to_element(account_box).click(account_box).send_keys(self.userName)
    steps.move_to_element(password_box).click(password_box).send_keys(self.password)
    steps.move_by_offset(login_button.location['x'], login_button.location['y'])
    steps.click(login_button)
    steps.perform()

    # Pause for one second before taking the screenshot.
    time.sleep(1)
    self.driver.save_screenshot('screenshotLoginQzone.png')
登录代码就写完了现在开始写个测试代码
if __name__ == '__main__':
    # Credentials are read before the first timestamp is taken.
    userName = input("账号:")
    password = input("密码:")

    oldTime = time.time()
    browser = qzone_dlewares(userName=userName, password=password)
    initTime = time.time()

    # Load the page (this also saves a screenshot).
    browser.startQzoneRequest()
    requestTime = time.time()

    # Replay the login interaction.
    browser.loginQzone()
    currentTime = time.time()

    # Timing report: absolute start/end stamps, then the per-phase durations.
    report = (
        ('开始时间 %f', oldTime),
        ('结束时间 %f', currentTime),
        ('初始化时间 %f', initTime - oldTime),
        ('加载页面时间 %f', requestTime - initTime),
        ('模仿操作时间 %f', currentTime - requestTime),
        ('总运行时间 %f', currentTime - oldTime),
    )
    for template, seconds in report:
        print(template % seconds)
运行测试结果
qzon_runtime_date.png qzon_screenshot_verify_login_success.jpg运行几遍后发现每次都要登录一遍,然后腾讯验证码也出来了。。。
先把验证码这块给处理了
按照开始寻找图片的方法把验证码图片、验证码输入框、按钮找出来
from selenium.webdriver import ActionChains
def check_code(self):
    """Save the captcha image, ask the user for the code and submit it.

    Looks up the captcha image (id ``cap_que_img``), its input box
    (id ``cap_input``) and the confirm button (id ``verify_btn``) in the
    current browsing context, stores the image via ``save_verify_code`` and
    then types whatever the user entered at the prompt.
    """
    que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
    que_input = self.driver.find_element_by_xpath('//*[@id="cap_input"]')
    que_but = self.driver.find_element_by_xpath('//*[@id="verify_btn"]')

    # Save the captcha image so the user can look at it.
    self.save_verify_code(que_code)

    # Ask for the code exactly once.
    input_verify_code = input("验证码:")

    action = ActionChains(self.driver)
    action.move_to_element(que_input)
    action.click()
    # Send the code read above; calling input() again here would block on a
    # second, unlabeled prompt and throw the first answer away.
    action.send_keys(input_verify_code)
    action.move_to_element(que_but)
    action.click()
    action.perform()
保存验证码
import urllib
def save_verify_code(self, element):
    """Download the image an element points at and save it as ``<id>.jpg``.

    Args:
        element: object exposing ``get_attribute``; its ``src`` attribute is
            used as the download URL and its ``id`` attribute as the stem of
            the output file name.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly where it is needed.
    import urllib.request

    url = element.get_attribute('src')
    fileName = element.get_attribute('id') + '.jpg'
    urllib.request.urlretrieve(url, fileName)
运行测试,发现以下错误<p style="color:red;">Traceback (most recent call last):
File "C:/Users/user/IdeaProjects/untitled/untitled/qzone.py", line 108, in <module>
browser.check_code2()
File "C:/Users/user/IdeaProjects/untitled/untitled/qzone.py", line 67, in check_code2
que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
File "D:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 313, in find_element_by_xpath
return self.find_element(by=By.XPATH, value=xpath)
File "D:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 791, in find_element
'value': value})['value']
File "D:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 256, in execute
self.error_handler.check_response(response)
File "D:\python\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 194, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: {"errorMessage":"Unable to find element with xpath '//[@id="cap_que_img"]'","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"108","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:53613","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{"using": "xpath", "value": "//*[@id=\"cap_que_img\"]", "sessionId": "737e6b90-6929-11e7-8958-3b746283f061"}","url":"/element","urlParsed":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/737e6b90-6929-11e7-8958-3b746283f061/element"}}
Screenshot: available via screen</p>
说没有找到这个节点,后来分析源码发现他是包在 iframe 中的 既然包在里面了那我们切换窗口好了
# 校验码
from selenium.common.exceptions import NoSuchElementException
def check_code(self):
    """Run the captcha step if the page contains the captcha iframe.

    Switches back to the top-level document, looks for the second iframe
    under the element with id ``new_vcode`` and, when it exists, switches
    into it and calls ``verify_code``. When it is missing, a message is
    printed and nothing else happens.
    """
    # Always start the lookup from the default (top-level) content.
    self.driver.switch_to.default_content()
    try:
        captcha_frame = self.driver.find_element_by_xpath('//*[@id="new_vcode"]/iframe[2]')
    except NoSuchElementException:
        print('无需输入验证码')
        return

    self.driver.switch_to.frame(captcha_frame)
    self.verify_code()
手动输入验证码,暂时还不能自动输入验证码 并且也不能验证验证码是否错误或者切换
# 验证码
def verify_code(self):
    """Show the captcha image to the user and submit the code they type.

    The image is saved through ``save_verify_code`` and opened with the
    platform's file opener (``os.startfile`` on Windows, ``xdg-open``
    otherwise); the user's answer is then typed into the captcha box and the
    confirm button is clicked.
    """
    # ``subprocess`` is a top-level module, not an attribute of ``os``.
    import os
    import subprocess

    que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
    que_input = self.driver.find_element_by_xpath('//*[@id="cap_input"]')
    que_but = self.driver.find_element_by_xpath('//*[@id="verify_btn"]')

    # Save the captcha image; the file name mirrors the one used when saving.
    self.save_verify_code(que_code)
    verify_path = que_code.get_attribute('id') + '.jpg'

    # Open the saved image so the user can read the code.
    if self.isWindows():
        os.startfile(verify_path)
    else:
        subprocess.call(["xdg-open", verify_path])
    input_verify_code = input("验证码:")

    # Type the code and press the confirm button.
    action = ActionChains(self.driver)
    action.move_to_element(que_input)
    action.click()
    action.send_keys(input_verify_code)
    action.move_to_element(que_but)
    action.click()
    action.perform()
完美运行登录成功
qzon_screenshot_verify_code.jpg qzon_screenshot_verify_login_success.jpg<a id='5'></a>
<p></p>
每次运行都需要手动登录太麻烦了 qzone 保存cookies 好像可以不需要手动登录了
# 保存登录 cookies
def save_cookies(self):
    """Pickle the browser's current cookies into the file named by ``hashCode()``."""
    target = self.hashCode()
    # The ``with`` block closes the file on exit.
    with open(target, 'wb') as out:
        pickle.dump(self.driver.get_cookies(), out)
# 读取并设置 cookies
def load_cookies(self):
    """Read the pickled cookie list and add each cookie to the browser.

    The file name comes from ``hashCode()``. If that file cannot be opened
    the method returns without doing anything. Errors raised while adding
    cookies are printed, not propagated, and stop the remaining cookies from
    being added.
    """
    fileName = self.hashCode()
    try:
        f = open(fileName, 'rb')
    except IOError:
        # No readable cookie file: nothing to restore.
        return

    # SECURITY: unpickling can execute arbitrary code; only load cookie files
    # that this program wrote itself.
    with f:  # closes the handle even if unpickling fails
        obj = pickle.load(file=f)

    # NOTE(review): callers in this file invoke load_cookies() before any
    # page has been opened; WebDriver may reject cookies whose domain does not
    # match the current page - confirm whether a page load should come first.
    try:
        for cookie in obj:
            self.driver.add_cookie(cookie)
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(e)
# hashCode
def hashCode(self):
    """Return the hexadecimal SHA-1 digest of ``b'qzone_cookies'``.

    The input is a fixed byte string, so every call returns the same
    40-character value.
    """
    # Use the public hashlib API instead of the private ``_sha1`` module.
    import hashlib

    return hashlib.sha1(b'qzone_cookies').hexdigest()
# 判断文件是否存在
def file_exists(self, filename):
    """Report whether ``filename`` can be opened for reading.

    Returns:
        True if ``open(filename)`` succeeds, False if it raises ``IOError``.
    """
    try:
        handle = open(filename)
    except IOError:
        return False
    handle.close()
    return True
测试代码
if __name__ == '__main__':
    # Credentials are read before the first timestamp is taken.
    userName = input("账号:")
    password = input("密码:")

    oldTime = time.time()
    browser = qzone_dlewares(userName=userName, password=password)
    # Restore cookies saved by an earlier run, if any.
    browser.load_cookies()
    initTime = time.time()

    # Load the page (this also saves a screenshot).
    browser.startQzoneRequest()
    requestTime = time.time()

    # Only go through the login form when the page still shows it.
    # NOTE(review): the captcha check is assumed to belong to this branch;
    # the original indentation was lost - confirm.
    if not browser.isLogin():
        browser.loginQzone()
        browser.check_code()
    currentTime = time.time()

    # Parse the feed, take a final screenshot and persist the cookies.
    browser.paresHtml()
    browser.driver.save_screenshot('screenshotLoginQzoneSuccess.png')
    browser.save_cookies()

    # Timing report: absolute start/end stamps, then the per-phase durations.
    report = (
        ('开始时间 %f', oldTime),
        ('结束时间 %f', currentTime),
        ('初始化时间 %f', initTime - oldTime),
        ('加载页面时间 %f', requestTime - initTime),
        ('模仿操作时间 %f', currentTime - requestTime),
        ('总运行时间 %f', currentTime - oldTime),
    )
    for template, seconds in report:
        print(template % seconds)
<a id='6'></a>
qzone 登录自动登录完整代码
import hashlib
import json
import os
import pickle
import platform
import subprocess
import time
import urllib
import urllib.request
from _sha1 import sha1

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import DesiredCapabilities, ActionChains
class qzone_dlewares(object):
    """Log in to the mobile Qzone site through a PhantomJS browser session.

    The instance keeps the account credentials and one WebDriver session in
    ``self.driver``. Cookies can be pickled to disk with ``save_cookies`` and
    fed back to the browser with ``load_cookies``.
    """

    # Request headers attached to every page load; the User-Agent is an
    # Android WeChat (MicroMessenger) browser string.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/4.5.255',
        'Connection': 'keep-alive',
    }

    def __init__(self, userName='', password='', *args, **kwargs):
        """Store the credentials and start PhantomJS with the custom headers.

        Args:
            userName: account name typed into the login form.
            password: password typed into the login form.
            *args, **kwargs: accepted but not used.
        """
        self.userName = userName
        self.password = password
        desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
        for key, value in self.headers.items():
            desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
        # Do not load images.
        desired_capabilities["phantomjs.page.settings.loadImages"] = False
        self.driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
        # Browser window size (width, height).
        self.driver.set_window_size(414, 736)

    def startQzoneRequest(self):
        """Open the mobile Qzone home page and save ``screenshot.png``."""
        self.driver.get('https://mobile.qzone.qq.com')
        self.driver.save_screenshot('screenshot.png')

    def isLogin(self):
        """Return True when the login form is absent from the current page.

        The page counts as "not logged in" only when the account box, the
        password box and the login button are all found.
        """
        try:
            for xpath in ('//*[@id="u"]', '//*[@id="p"]', '//*[@id="go"]'):
                self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException:
            return True
        return False

    def loginQzone(self):
        """Type the stored credentials into the login form and submit it.

        Afterwards waits one second and saves ``screenshotLoginQzone.png``.
        """
        u = self.driver.find_element_by_xpath('//*[@id="u"]')
        p = self.driver.find_element_by_xpath('//*[@id="p"]')
        go = self.driver.find_element_by_xpath('//*[@id="go"]')
        # Empty both fields first so that earlier content is not appended to.
        # (The password field used to be clicked here instead of cleared.)
        u.clear()
        p.clear()
        # Every call below only queues a step; perform() replays them in order.
        action = ActionChains(self.driver)
        action.move_to_element(u)
        action.click(u)
        action.send_keys(self.userName)
        action.move_to_element(p)
        action.click(p)
        action.send_keys(self.password)
        action.move_by_offset(go.location['x'], go.location['y'])
        action.click(go)
        action.perform()
        # Pause for one second before taking the screenshot.
        time.sleep(1)
        self.driver.save_screenshot('screenshotLoginQzone.png')

    def save_verify_code(self, element):
        """Download the image an element points at and save it as ``<id>.jpg``.

        Args:
            element: object exposing ``get_attribute``; its ``src`` attribute
                is the download URL and its ``id`` attribute the file-name
                stem.
        """
        url = element.get_attribute('src')
        fileName = element.get_attribute('id') + '.jpg'
        urllib.request.urlretrieve(url, fileName)

    def check_code(self):
        """Run the captcha step if the page contains the captcha iframe.

        When the iframe is missing, a message is printed and nothing else
        happens.
        """
        # Always start the lookup from the default (top-level) content.
        self.driver.switch_to.default_content()
        try:
            iframe = self.driver.find_element_by_xpath('//*[@id="new_vcode"]/iframe[2]')
        except NoSuchElementException:
            print('无需输入验证码')
        else:
            self.driver.switch_to.frame(iframe)
            self.verify_code()

    def verify_code(self):
        """Show the captcha image to the user and submit the code they type.

        The image is opened with ``os.startfile`` on Windows and with
        ``xdg-open`` everywhere else.
        """
        que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
        que_input = self.driver.find_element_by_xpath('//*[@id="cap_input"]')
        que_but = self.driver.find_element_by_xpath('//*[@id="verify_btn"]')
        # Save the captcha image; the file name mirrors the one used when saving.
        self.save_verify_code(que_code)
        verify_path = que_code.get_attribute('id') + '.jpg'
        # Open the saved image so the user can read the code.
        if self.isWindows():
            os.startfile(verify_path)
        else:
            # ``subprocess`` is a top-level module, not an attribute of ``os``.
            subprocess.call(["xdg-open", verify_path])
        input_verify_code = input("验证码:")
        # Type the code and press the confirm button.
        action = ActionChains(self.driver)
        action.move_to_element(que_input)
        action.click()
        action.send_keys(input_verify_code)
        action.move_to_element(que_but)
        action.click()
        action.perform()

    def paresHtml(self):
        """Placeholder for parsing the feed; currently does nothing."""
        pass

    def isWindows(self):
        """Return True when ``platform.system()`` reports ``"Windows"``."""
        return platform.system() == "Windows"

    def save_cookies(self):
        """Pickle the browser's current cookies into the file named by ``hashCode()``."""
        with open(self.hashCode(), 'wb') as f:
            pickle.dump(self.driver.get_cookies(), f)

    def load_cookies(self):
        """Read the pickled cookie list and add each cookie to the browser.

        If the cookie file cannot be opened the method returns without doing
        anything. Errors raised while adding cookies are printed, not
        propagated, and stop the remaining cookies from being added.
        """
        fileName = self.hashCode()
        try:
            f = open(fileName, 'rb')
        except IOError:
            # No readable cookie file: nothing to restore.
            return
        # SECURITY: unpickling can execute arbitrary code; only load cookie
        # files that this program wrote itself.
        with f:  # closes the handle even if unpickling fails
            obj = pickle.load(file=f)
        # NOTE(review): the script below calls load_cookies() before any page
        # has been opened; WebDriver may reject cookies whose domain does not
        # match the current page - confirm whether a page load should come first.
        try:
            for cookie in obj:
                self.driver.add_cookie(cookie)
        except Exception as e:
            # Deliberately best-effort: report the problem and carry on.
            print(e)

    def hashCode(self):
        """Return the cookie file name: the hex SHA-1 digest of ``b'qzone_cookies'``."""
        # Public hashlib API instead of the private ``_sha1`` module.
        return hashlib.sha1(b'qzone_cookies').hexdigest()

    def file_exists(self, filename):
        """Report whether ``filename`` can be opened for reading."""
        try:
            with open(filename):
                return True
        except IOError:
            return False

    def __del__(self):
        """Quit the browser when the object is finalized."""
        # __init__ may have failed before ``driver`` was assigned.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()
if __name__ == '__main__':
    # Credentials are read before the first timestamp is taken.
    userName = input("账号:")
    password = input("密码:")

    oldTime = time.time()
    browser = qzone_dlewares(userName=userName, password=password)
    # Restore cookies saved by an earlier run, if any.
    browser.load_cookies()
    initTime = time.time()

    # Load the page (this also saves a screenshot).
    browser.startQzoneRequest()
    requestTime = time.time()

    # Only go through the login form when the page still shows it.
    # NOTE(review): the captcha check is assumed to belong to this branch;
    # the original indentation was lost - confirm.
    if not browser.isLogin():
        browser.loginQzone()
        browser.check_code()
    currentTime = time.time()

    # Parse the feed, take a final screenshot and persist the cookies.
    browser.paresHtml()
    browser.driver.save_screenshot('screenshotLoginQzoneSuccess.png')
    browser.save_cookies()

    # Timing report: absolute start/end stamps, then the per-phase durations.
    report = (
        ('开始时间 %f', oldTime),
        ('结束时间 %f', currentTime),
        ('初始化时间 %f', initTime - oldTime),
        ('加载页面时间 %f', requestTime - initTime),
        ('模仿操作时间 %f', currentTime - requestTime),
        ('总运行时间 %f', currentTime - oldTime),
    )
    for template, seconds in report:
        print(template % seconds)
总结
1、QQ空间登录其实可以使用js来模仿用户操作直接输入,代码量也很少
2、然后这边也有一个写入cookies 的bug因为作用域不对会报错
我只是一只小菜鸟,如果你看到代码有错误的地方请提出来