爬虫验证码之--微博宫格验证码的识别

2018-07-04 本文已影响0人 strive鱼
本文要识别的验证码类型为宫格验证码，主要以微博为例：多次登录微博（https://passport.weibo.cn/signin/login）后就会出现该类验证码。
本文的重点仍然是 Selenium 的使用。
import os
import time
from io import BytesIO
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from os import listdir

# Weibo account credentials typed into the login form.
# The values below are placeholders and must be replaced before running.
USERNAME = 'xxxxxx'
PASSWORD = 'xxxxxxx'

# Folder that is listed for stored captcha template images during matching.
TEMPLATES_FOLDER = 'templates/'


class CrackWeiboSlide():
    """Log in to mobile Weibo and solve its grid ("pattern") captcha.

    The captcha is recognised by comparing a screenshot of the captcha
    element against pre-collected template images stored in
    ``TEMPLATES_FOLDER``.  Each template's file name (without extension)
    is the sequence of digits giving the order in which the grid points
    must be connected, e.g. ``1432.png``.
    """

    def __init__(self):
        """Start a Chrome session and remember the login settings."""
        self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://m.weibo.cn/'
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.username = USERNAME
        self.password = PASSWORD

    def __del__(self):
        """End the browser session when the object is garbage collected."""
        # ``browser`` is missing when ``__init__`` failed before creating it.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            # quit() ends the whole driver session, not just the current window.
            browser.quit()

    def open(self):
        """
        Open the login page, fill in user name and password, and submit.

        :return: None
        """
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
        username.send_keys(self.username)
        password.send_keys(self.password)
        submit.click()

    def get_position(self, retries=3):
        """
        Locate the captcha element on the page.

        If the captcha does not show up within the wait timeout, the login
        is submitted again and the lookup is retried.

        :param retries: how many times to re-submit the login before giving up
        :return: tuple ``(top, bottom, left, right)`` of the captcha element
        :raises TimeoutException: if the captcha never appears
        """
        attempts_left = retries
        while True:
            try:
                img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'patt-shadow')))
                break
            except TimeoutException:
                print('未出现验证码')
                if attempts_left <= 0:
                    raise
                attempts_left -= 1
                self.open()
        # Give the captcha a moment to finish rendering before measuring it.
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """
        Take a screenshot of the current page.

        :return: the screenshot as a PIL image
        """
        png_bytes = self.browser.get_screenshot_as_png()
        return Image.open(BytesIO(png_bytes))

    def get_image(self, name='captcha.png'):
        """
        Crop the captcha out of a page screenshot and save it to disk.

        :param name: file name the cropped captcha is saved under
        :return: the cropped captcha as a PIL image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    @staticmethod
    def _pixels_close(pixel1, pixel2, threshold=20):
        """Return True if the first three channels differ by less than ``threshold``."""
        return all(abs(pixel1[channel] - pixel2[channel]) < threshold for channel in range(3))

    @staticmethod
    def _split_offset(total, steps):
        """Split ``total`` into ``steps`` integer moves that sum to ``round(total)``.

        Cumulative rounding is used so the moves add up to the full
        distance even when ``total / steps`` is fractional.
        """
        bounds = [round(total * i / steps) for i in range(steps + 1)]
        return [end - start for start, end in zip(bounds, bounds[1:])]

    def is_pixel_equal(self, image1, image2, x, y):
        """
        Check whether two images have (nearly) the same colour at one point.

        :param image1: first image
        :param image2: second image
        :param x: x coordinate of the pixel
        :param y: y coordinate of the pixel
        :return: True if each of the first three channels differs by less than 20
        """
        return self._pixels_close(image1.load()[x, y], image2.load()[x, y])

    def same_image(self, image, template):
        """
        Decide whether a captured captcha matches a stored template.

        Every pixel of ``image`` is compared with the pixel at the same
        position in ``template``; the images match when more than 99% of
        the pixels are close enough.

        :param image: the captcha to recognise
        :param template: a stored template image
        :return: True if the images match, otherwise False
        """
        threshold = 0.99
        total = image.width * image.height
        # An empty image cannot match; a template smaller than the image
        # would be indexed out of bounds, so treat it as a non-match.
        if total == 0 or template.width < image.width or template.height < image.height:
            return False
        # Load the pixel accessors once instead of once per pixel.
        image_pixels = image.load()
        template_pixels = template.load()
        count = sum(
            1
            for i in range(image.width)
            for j in range(image.height)
            if self._pixels_close(image_pixels[i, j], template_pixels[i, j])
        )
        if float(count) / total > threshold:
            print('成功匹配')
            return True
        return False

    def detect_image(self, image):
        """
        Find the template matching ``image`` and derive the drag order.

        :param image: the captcha to recognise
        :return: list of grid point numbers in drag order, or None if no
            template matches
        """
        for template_name in sorted(listdir(TEMPLATES_FOLDER)):
            order = template_name.split('.')[0]
            # Skip files whose name does not encode a drag order
            # (hidden files, stray non-template files, ...).
            if not order.isdecimal():
                continue
            print('正在匹配', template_name)
            with Image.open(os.path.join(TEMPLATES_FOLDER, template_name)) as template:
                matched = self.same_image(image, template)
            if matched:
                numbers = [int(digit) for digit in order]
                print('拖动顺序', numbers)
                return numbers
        return None

    def move(self, numbers):
        """
        Drag the mouse through the grid points in the given order.

        :param numbers: 1-based grid point numbers in the order to connect
        :return: None
        """
        if not numbers:
            return
        circles = self.browser.find_elements(By.CSS_SELECTOR, '.patt-wrap .patt-circ')
        path = [circles[number - 1] for number in numbers]
        # Number of small moves per segment, so the drag is gradual.
        times = 30
        # Press the mouse button on the centre of the first point.
        ActionChains(self.browser).move_to_element(path[0]).click_and_hold().perform()
        for current, target in zip(path, path[1:]):
            dx = target.location['x'] - current.location['x']
            dy = target.location['y'] - current.location['y']
            # Use whole-pixel steps that add up exactly to (dx, dy);
            # fractional steps may be truncated and fall short of the target.
            x_steps = self._split_offset(dx, times)
            y_steps = self._split_offset(dy, times)
            for step_x, step_y in zip(x_steps, y_steps):
                ActionChains(self.browser).move_by_offset(step_x, step_y).perform()
                time.sleep(1 / times)
        # Release the mouse button on the last point.
        ActionChains(self.browser).release().perform()

    def crack(self):
        """
        Entry point: log in, capture the captcha, match it and drag the pattern.

        :return: None
        """
        self.open()
        image = self.get_image('captcha.png')
        numbers = self.detect_image(image)
        if not numbers:
            # No template matched, so there is no drag order to replay.
            print('未匹配到任何模板')
            return
        self.move(numbers)
        time.sleep(10)
        print('识别结束')

    def main(self, limit=None):
        """
        Collect captcha samples by logging in repeatedly.

        Each captured captcha is saved as ``<index>.png``.

        :param limit: number of captchas to collect; None (the default)
            keeps collecting until the process is interrupted
        :return: None
        """
        count = 0
        while limit is None or count < limit:
            self.open()
            self.get_image(str(count) + '.png')
            count += 1
            
            
if __name__ == '__main__':
    # Run one full login-and-solve cycle when executed as a script.
    cracker = CrackWeiboSlide()
    cracker.crack()
爬虫验证码之--微博宫格验证码的识别

猜你喜欢

热点阅读