Heibanke Crawler Challenge - Level 4
2018-11-27
MA木易YA
- Level 4 adds login verification and password retrieval.
- To reach the main page you first have to log in.
- Once in, you can try any password you like; it fails, of course, and the system then offers a password table.

[figure: the password table]

- Then comes the tedious decoding work. The flow is much the same as before, the same simulated login, except that this level adds the password-retrieval step. I was pretty lost when I started; with pointers from people online I learned that the password is 100 characters long (QAQ), and the pages load extremely slowly, so the whole process was a grind.
The level is mainly meant to exercise simulated login and multithreaded crawling (threads make collecting the password faster), though you can also just brute-force it. Three versions are given below for reference.
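All three scripts begin with the same Django login handshake: GET the login page to receive a csrftoken cookie, then POST it back as the csrfmiddlewaretoken form field together with the credentials. A minimal standalone sketch of just that handshake (the username and password here are placeholders):

import requests

def heibanke_login(username, password):
    # GET the login page; Django sets a csrftoken cookie on the response
    s = requests.Session()
    login_url = 'http://www.heibanke.com/accounts/login'
    token = s.get(login_url, timeout=30).cookies['csrftoken']
    # Echo the token back as the csrfmiddlewaretoken form field,
    # otherwise Django rejects the POST with a 403
    s.post(login_url, data={
        'username': username,
        'password': password,
        'csrfmiddlewaretoken': token,
    }, timeout=30)
    return s  # the session now carries the authenticated cookies

# usage: se = heibanke_login('whaike', '12345654321')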
one.py (single-threaded: keep fetching password fragments until the 100-character password is assembled, then test it)
import requests
from lxml import etree
import re

se = requests.session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
}

class HBK():
    def __init__(self):
        self.login_url = "http://www.heibanke.com/accounts/login"
        self.username = "whaike"
        self.password = "12345654321"
        self.passwords = ['' for i in range(101)]  # index 1..100 holds the character at each position
        self.pwd = ''

    # Get the CSRF token needed before logging in
    def getCsrf(self):
        res = se.get(url=self.login_url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        self.csrf = tree.xpath('/html/body/div/div/div[2]/form/input[@name="csrfmiddlewaretoken"]/@value')[0]

    # Log in
    def login(self):
        self.getCsrf()
        data = {
            "csrfmiddlewaretoken": self.csrf,
            "username": self.username,
            "password": self.password
        }
        se.post(url=self.login_url, headers=headers, data=data, timeout=30)
        print('Logged in')

    # Get the post-login CSRF token, i.e. the one for the level-4 form
    def getNCsrf(self):
        url = 'http://www.heibanke.com/lesson/crawler_ex03/'
        res = se.get(url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        csrf = tree.xpath('//input[1]/@value')[0]
        return csrf

    # Test whether the assembled password is correct
    def guesspwd(self):
        url = 'http://www.heibanke.com/lesson/crawler_ex03/'
        csrf = self.getNCsrf()
        data = {
            "csrfmiddlewaretoken": csrf,
            "username": "whaike",
            "password": self.pwd
        }
        res = se.post(url, headers=headers, data=data, timeout=30)
        if int(res.status_code) == 200:
            self.h3 = re.findall('<h3>(.*?)</h3>', res.text)
            return True
        else:
            return False

    # Keep fetching the random values on page 1 until the password is 100
    # characters long, then start guessing; on failure keep going, stop on success.
    def getPasswords(self):
        print('Fetching page 1')
        url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/?page=1'
        res = se.get(url, headers=headers, timeout=30).text
        tree = etree.HTML(res)
        trs = tree.xpath('/html/body/div/div/div[2]/table/tr')[1:]
        for tr in trs:
            p1 = tr.xpath('td[1]/text()')[0]  # position
            p = int(re.findall(r'\d+', p1)[0])  # stray characters sometimes precede the digits, so extract just the number
            w = tr.xpath('td[2]/text()')[0]  # value
            self.passwords[p] = w
        self.pwd = ''.join(self.passwords)
        length = len(self.pwd)  # password length
        print('Current password: %s, length %d' % (self.pwd, length))
        if length == 100:
            print('Length reached, guessing...')
            if self.guesspwd():
                print('Guess succeeded, the password is: %s' % self.pwd)
            else:
                print('Guess failed, continuing')
                self.getPasswords()
        else:  # not yet 100 characters: fetch page 1 again for more random values
            self.getPasswords()  # recursion

if __name__ == '__main__':
    print('Starting level 4')
    spider = HBK()
    spider.login()
    spider.getPasswords()
    print(spider.h3)
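one.py recurses once per page fetch, so if the random pages take a long time to cover all 100 positions the call depth keeps growing. A loop-based variant of the same routine (a sketch intended to replace getPasswords inside the HBK class above; it reuses that script's se, headers, re and etree):

    # iterative replacement for HBK.getPasswords: same logic, no recursion
    def getPasswordsLoop(self):
        url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/?page=1'
        while True:
            print('Fetching page 1')
            res = se.get(url, headers=headers, timeout=30).text
            tree = etree.HTML(res)
            for tr in tree.xpath('/html/body/div/div/div[2]/table/tr')[1:]:
                p = int(re.findall(r'\d+', tr.xpath('td[1]/text()')[0])[0])
                self.passwords[p] = tr.xpath('td[2]/text()')[0]
            self.pwd = ''.join(self.passwords)
            print('Current password: %s, length %d' % (self.pwd, len(self.pwd)))
            # only guess once all 100 positions are filled; stop on success
            if len(self.pwd) == 100 and self.guesspwd():
                print('Guess succeeded, the password is: %s' % self.pwd)
                break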
two.py (multithreaded version, adapted from someone online)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-09-02 22:25:21
# @Author  : bb (317716008@qq.com)
# @Word    : python can change world!
# @Version : python3.6

import re
import requests
from bs4 import BeautifulSoup
import threading
from queue import Queue

dict1 = {}
values = []
web1 = "http://www.heibanke.com/accounts/login"
web2 = "http://www.heibanke.com/lesson/crawler_ex03/pw_list/"
web3 = "http://www.heibanke.com/lesson/crawler_ex03/"
queuewz = Queue()  # queue of password positions
queuemm = Queue()  # queue of password values

class mythreads(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        work()
        while not queuemm.empty():
            try:
                dict1[str(queuewz.get())] = queuemm.get()
                print(dict1)
                print("Dictionary size: %s" % len(dict1))
                if len(dict1) == 100:
                    print("Collected all 100!")
                    for i in range(1, 101):
                        values.append(dict1[str(i)])
                    c = values[:100]
                    zzmm = ''.join(c)
                    print("The password is %s" % zzmm)
                    print("Logging in.......")
                    s = login_get()
                    # include the CSRF token, otherwise Django rejects the POST
                    dataWebsite1 = {'username': 'user',
                                    'password': zzmm,
                                    'csrfmiddlewaretoken': s.cookies['csrftoken']}
                    res = s.post(web3, data=dataWebsite1).text
                    if u'恭喜' in res:  # "congratulations" appears on the success page
                        title = re.findall("<title>(.*?)</title>", res)
                        word = re.findall("<h1>(.*?)</h1>", res)
                        word2 = re.findall("<h3>(.*?)</h3>", res)
                        # the link text "下一关" means "next level"
                        html = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', res)
                        print('\n'.join([title[0], word[0], word2[0], 'The next level is at',
                                         'http://www.heibanke.com' + html[0]]))
                        break
                    else:
                        print("Something is wrong with the page; try entering the recovered password by hand!")
                        break
                else:
                    main()  # not all 100 yet: fetch more pages with a fresh batch of threads
            except IndexError:
                print("The list is empty, next page!")

def login_get():
    try:
        s = requests.Session()
        r = s.get(web1)  # visit the login page to obtain the csrftoken needed to log in
        token1 = r.cookies['csrftoken']  # save the csrftoken
        # put the csrftoken into the csrfmiddlewaretoken field
        dataWebsite1 = {'username': 'user',
                        'password': 'password',
                        'csrfmiddlewaretoken': token1
                        }
        res = s.post(web1, data=dataWebsite1)
    except KeyError as e:
        pass
    return s

def get_html(s):
    r = s.get(web2)
    res = r.text
    return res

def get_dict(res):
    soup = BeautifulSoup(res, "html.parser")
    for a in soup.find_all('td', attrs={'title': 'password_pos'}):
        wz = a.string
        queuewz.put(wz)
    for b in soup.find_all('td', attrs={'title': 'password_val'}):
        mm = b.string
        queuemm.put(mm)

def work():
    res = get_html(s)
    get_dict(res)

def main():
    global s
    s = login_get()
    threads = []
    threads_count = 10
    for i in range(threads_count):
        threads.append(mythreads())
    for t in threads:
        t.start()
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
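One caveat about two.py's design: positions and values travel in two separate queues, so when several threads interleave their get() calls, a position from one page can end up paired with a value from another. A safer variant (a sketch, not the original author's code) keeps each (pos, val) pair together in a single queue:

from queue import Queue
from bs4 import BeautifulSoup

pairs = Queue()  # each item is a (position, value) tuple, so pairing cannot be lost

def get_pairs(res):
    # same extraction as get_dict in two.py, but enqueue the cells pairwise
    soup = BeautifulSoup(res, "html.parser")
    pos_cells = soup.find_all('td', attrs={'title': 'password_pos'})
    val_cells = soup.find_all('td', attrs={'title': 'password_val'})
    for a, b in zip(pos_cells, val_cells):
        pairs.put((a.string, b.string))

# consumer side, in each worker thread:
#     pos, val = pairs.get()
#     dict1[str(pos)] = val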
three.py (multithreaded, with the thread count capped to respect the server's rate limit)
import re
import requests
from threading import Thread, Lock
import time

def print_run_time(func):
    """Decorator: print the running time"""
    def wrapper(self, *args, **kw):
        local_time = time.time()
        func(self)
        print('run time is {:.2f}:'.format(time.time() - local_time))
    return wrapper

class hbk_crawler(object):
    """Heibanke crawler challenge"""

    def __init__(self):
        self.lock = Lock()  # guards self.count and self.pw_dict across threads

    def login(self):
        """Log in for level 4"""
        self.url = 'http://www.heibanke.com/lesson/crawler_ex03'
        self.login_url = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03'
        self.s = requests.session()
        print("Logging in to level 4....")
        try:
            self.csrftoken = self.s.get(self.login_url).cookies['csrftoken']
        except Exception:
            print("Network error, please retry...")
            exit()
        self.payload = {'username': 'test', 'password': 'test123',
                        'csrfmiddlewaretoken': self.csrftoken}
        # Django rotates the CSRF token after login, so keep the new one
        self.payload['csrfmiddlewaretoken'] = self.s.post(
            self.login_url, self.payload).cookies['csrftoken']
        print("Logged in....")
        return None

    def parseurl(self, url):
        """Parse the page, extracting password positions and values"""
        while self.count < 100:
            response = self.s.get(url)
            if response.ok:
                content = response.text
                pos_pattern = r'_pos.>(.*)</td>'
                val_pattern = r'_val.>(.*)</td>'
                pos_list = re.findall(pos_pattern, content)
                val_list = re.findall(val_pattern, content)
                with self.lock:
                    for pos, val in zip(pos_list, val_list):
                        if pos not in self.pw_dict:
                            self.pw_dict[pos] = val
                            self.count = self.count + 1
                    print(str(self.count) + '%' + self.count // 2 * '*')

    @print_run_time
    def ex04(self, *args, **kw):
        """Level 4: find the password. Adds login verification, CSRF
        protection, a 100-character password and slower responses."""
        self.count = 0
        self.login()
        self.pw_dict = {}
        pw_url = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list'
        # Thread count: the Heibanke server answers at most 2 requests per
        # 15 seconds and returns 404 beyond that.
        n = 2
        threads = [Thread(target=self.parseurl, args=(pw_url,))
                   for i in range(n)]
        for t in threads:
            print(t.name, 'start...')
            t.start()
        for t in threads:
            t.join()
        self.pw_list = ['' for n in range(101)]
        for pos in self.pw_dict.keys():
            self.pw_list[int(pos)] = self.pw_dict[pos]
        # keep the password as a string; int() would drop any leading zeros
        password = ''.join(self.pw_list)
        self.payload['password'] = password
        response = self.s.post(self.url, self.payload)
        pattern = r'<h3>(.*)</h3>'
        result = re.findall(pattern, response.text)
        # the link text "下一关" means "next level"
        result2 = re.findall('<a href="(.*?)" class="btn btn-primary">下一关</a>', response.text)
        print(result[0])
        print(result2)

if __name__ == '__main__':
    Hbk_crawler = hbk_crawler()
    Hbk_crawler.ex04()
To sum up: the approaches all rest on the same principle; multithreading mainly speeds up the data collection. I'll cover multithreading in a later post, and in the meantime the official docs or Liao Xuefeng's tutorial are worth a look. Having to repeat the simulated login in every script is a bit of a hassle, so take care not to let the code get redundant. The error-message extraction is much the same everywhere, done with re. If the `"text" in xxx` membership test doesn't fit, you can instead capture the error message into a result: an empty result means the password was right, a non-empty one means something failed. And since the pages load slowly, print some progress output so you don't think the script has died.
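A small illustration of that last pattern, assuming the failure message sits in a tag the success page does not use (the error_tips class here is hypothetical, adjust it to whatever wraps the failure text on the page you are scraping):

import re

def attempt_succeeded(html):
    # hypothetical marker: assume the failure message is rendered inside
    # a <div class="error_tips"> that the success page does not contain
    errors = re.findall(r'<div class="error_tips">(.*?)</div>', html)
    if errors:
        print('Failure message:', errors[0])
        return False
    return True  # empty result: no error message, so the guess was right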
- For more of the code, see my GitHub