requests库

2019-07-28  本文已影响0人  叶扬风起

特点:requests库发送请求将网页内容下载下来以后,并不会执行js代码,这需要我们自己分析目标站点然后发起新的request请求

一、requests模块支持的请求方式:

import requests

# One helper function per HTTP verb; the body for POST/PUT goes in `data`.
form = {'key': 'value'}
r = requests.get('https://api.github.com/events')
r = requests.post('http://httpbin.org/post', data=form)
r = requests.put('http://httpbin.org/put', data=form)
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')

二、requests发送GET请求

1、基本get请求
import requests

# Fetch the page and print its body decoded to text.
page = requests.get('http://dig.chouti.com/')
print(page.text)

查看 response 的编码
Response 对象的 encoding 属性:查看(或设置)返回网页数据的编码
import requests

# Browser-like User-Agent sent with the request.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1;Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
}
url = 'https://www.baidu.com/'
response = requests.get(url=url, headers=browser_headers)

print(response.encoding)     # encoding currently associated with the body
response.encoding = 'utf-8'  # override the encoding used to decode .text
print(response.status_code)

# Save the decoded page to disk.
with open('a.html', 'w', encoding='utf-8') as out:
    out.write(response.text)
2、带参数的GET请求
# GET with a query string: URL-encode the parameters by hand and append them
# to the URL.
from urllib.parse import urlencode
import requests

k = input('输入关键字:  ').strip()
res = urlencode({'wd': k}, encoding='utf-8')  # percent-encode the keyword
search_url = 'https://www.baidu.com/s?%s' % res
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
}
respose = requests.get(search_url, headers=ua_headers)

# Save the result page to disk.
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(respose.text)
# GET with a query string: hand the parameters to `params` and let requests
# URL-encode them, instead of calling urlencode() and building the URL by hand.
k = input('输入关键字:  ').strip()
respose = requests.get(
    'https://www.baidu.com/s?',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    },
    params={'wd': k},
)

# Save the result page to disk.
with open('a.html', 'w', encoding='utf-8') as f:
    f.write(respose.text)
# Same search request, additionally sending a cookie.
# The keyword argument is `cookies` (lowercase): an unknown `Cookies` keyword
# is rejected with a TypeError before any request is sent.
respose = requests.get(
    'https://www.baidu.com/s?',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    },
    params={'wd': k},
    cookies={'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc'},
)

# allow_redirects=False stops requests from following the Location header of
# a redirect response; the default (True) follows it.
# With it disabled you stay on the current response and can read its headers,
# including the redirect target; otherwise what you get back is the content of
# the page reached after the redirect.
# NOTE(review): `session` is not created in this snippet -- presumably a
# requests session set up earlier; confirm before running.
grant_headers = {
    'Referer': '//passport.lagou.com/login/login.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Host': 'passport.lagou.com',
}
r3 = session.get(
    'https://passport.lagou.com/grantServiceTicket/grant.html',
    headers=grant_headers,
    allow_redirects=False,  # do not follow the redirect issued after authorization
)

三、requests发送POST请求

requests.post()用法与requests.get()完全一致,特殊的是requests.post()有一个data参数,用来存放请求体数据,也就是POST请求的请求体
发送post请求,模拟浏览器的登录github

import requests
import re

ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
}

# Step 1: load the login page to obtain the form token and the initial cookies.
r1 = requests.get('https://github.com/login/', headers=ua_headers)

token_match = re.search(r'name="authenticity_token".*?value="(.*?)"', r1.text, re.S)
if token_match is None:
    # Fail with a clear message instead of an IndexError from findall(...)[0].
    raise RuntimeError('authenticity_token not found in the login page')
authenticity_token = token_match.group(1)

# r1.cookies.items() gives (name, value) pairs; get_dict() gives a plain dict.
cookies = r1.cookies.get_dict()

# Step 2: submit the login form together with the cookies from step 1.
# NOTE: the credentials below are hard-coded sample values; never commit real ones.
r2 = requests.post(
    'https://github.com/session',
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'login': '13220198866@163.com',
        'password': '123.com',
    },
    headers=ua_headers,
    cookies=cookies,
)

# Step 3: request a settings page using the cookies returned by the login response.
cookies2 = r2.cookies.get_dict()
r3 = requests.get('https://github.com/settings/emails', cookies=cookies2)

# True when the account e-mail appears in the page, i.e. the login worked.
print('13220198866@163.com' in r3.text)

3、使用requests.post() 之 content-type
# The URLs below are placeholders.
# With no explicit header, a dict passed as `data` is form-encoded and sent
# with Content-Type: application/x-www-form-urlencoded.
requests.post(url='xxxxxxxx',
              data={'xxx': 'yyy'})

# To send JSON to the server, pass the object through `json=`: requests
# serializes it and sets Content-Type: application/json itself. A dict passed
# via `data` is still form-encoded even when a JSON content-type header is
# set, so the server cannot parse the body as JSON.
requests.post(url='',
              json={'': 1})

四 、requests模块的响应Response

1、response属性
response = requests.get('http://www.cnblogs.com/sss4/')
print(response.text)                # body decoded to text
print(response.content)             # raw body bytes (e.g. for images or video)
print(response.status_code)         # HTTP status code
print(response.headers)             # response headers
print(response.cookies)             # cookies sent back by the server
print(response.cookies.get_dict())  # cookies as a dict
print(response.cookies.items())     # cookies as a list of (name, value) pairs
print(response.url)                 # URL of the response, e.g. http://www.cnblogs.com/sss4/
print(response.history)             # responses of the redirects followed to get here
print(response.json())              # body parsed as JSON
response.encoding = 'gbk'           # encoding requests uses to decode .text

五、requests模块的高级用法

1、SSL Cert Verification(验证证书)
import requests
from requests.packages import urllib3

# Skip certificate verification and silence the warning urllib3 emits for it.
urllib3.disable_warnings()
respone = requests.get('https://www.12306.cn', verify=False)
print(respone.status_code)

# Present a client certificate as a (certificate file, key file) pair.
client_cert = ('/path/server.crt', '/path/key')
respone = requests.get('https://www.12306.cn', cert=client_cert)
print(respone.status_code)
2、使用爬虫代理
# Docs: http://docs.python-requests.org/en/master/user/advanced/#proxies

# Proxying: the request is sent to the proxy, which forwards it on your behalf
# (IP bans are common when scraping).
import requests

# One entry per URL scheme. A dict literal keeps only the last value for a
# repeated key, so the authenticated variant is shown as a commented
# alternative instead of a second 'http' entry that would be overwritten.
proxies = {
    # 'http': 'http://egon:123@localhost:9743',  # with credentials: user:password before the '@'
    'http': 'http://localhost:9743',
    'https': 'https://localhost:9743',
}
respone = requests.get('https://www.12306.cn', proxies=proxies)

print(respone.status_code)



# SOCKS proxies are supported too; install with: pip install requests[socks]
import requests

socks_url = 'socks5://user:pass@host:port'
proxies = {'http': socks_url, 'https': socks_url}
respone = requests.get('https://www.12306.cn', proxies=proxies)

print(respone.status_code)
3、超时设置
import requests

target = 'https://www.baidu.com/'

# A single number is applied to the connect phase and to each wait for data
# from the server; it is not a cap on the total time of the request.
result = requests.get(target, timeout=0.0001)

# A (connect, read) tuple sets the two timeouts separately: here 1 second to
# establish the connection and 2 seconds to wait for data.
result2 = requests.get(target, timeout=(1, 2))
4、 认证设置
# Docs: http://docs.python-requests.org/en/master/user/authentication/

# Authentication: some sites pop up a dialog asking for a user name and
# password (much like a JavaScript alert); until it is answered the HTML
# cannot be fetched.
# Underneath, the credentials are simply sent as a request header:
#         r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
# Most sites do not use this default scheme but implement their own, in which
# case you write a function analogous to _basic_auth_str that follows the
# site's scheme and put its result into the header:
#         r.headers['Authorization'] = func('.....')

# The default scheme, for reference (sites usually do not rely on it).
import requests
from requests.auth import HTTPBasicAuth

basic = HTTPBasicAuth('user', 'password')
r = requests.get('xxx', auth=basic)
print(r.status_code)

# Shorthand: a (user, password) tuple can be passed instead of HTTPBasicAuth.
credentials = ('user', 'password')
r = requests.get('xxx', auth=credentials)
print(r.status_code)
5、requests模块自带异常处理
# Exception handling
import requests
# Import the exception classes by name rather than with a wildcard; the
# requests.exceptions module lists every available type.
from requests.exceptions import ReadTimeout, RequestException

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('===:')
except RequestException:
    # Base class of the requests exceptions: catches everything not handled
    # above, e.g. ConnectionError (network unreachable) or other Timeout errors.
    print('Error')
6、使用requests模块上传文件
import requests

# Open the file in a `with` block so the handle is closed once the upload
# has finished, instead of leaking it.
with open('a.jpg', 'rb') as image:
    files = {'file': image}
    respone = requests.post('http://httpbin.org/post', files=files)
print(respone.status_code)

六、requests.session()方法

# A session acts as a container for the cookies of a conversation: cookies
# from its responses are stored and sent automatically on later requests.
# It offers the same methods as the top-level API: session.get(), session.post(), ...
# Create it once and reuse it for every request of the conversation.
session = requests.session()

# Log in with a POST so the session stores the login cookies
# (renren.com login used as the example).
post_url = "http://www.renren.com/PLogin.do"
headers = {"User-Agent": "Mozilla/5.0"}
post_data = {"email": "username", "password": "password"}
session.post(post_url, headers=headers, data=post_data)

# Request a page that requires login through the same session to get the
# logged-in page content.
url = "http://www.renren.com/xxxxx/profile"
response = session.get(url, headers=headers)
上一篇 下一篇

猜你喜欢

热点阅读