Python网络爬虫实战之十一:利用API进行数据采集
2018-08-29 本文已影响178人
27efec53a72d
一、什么是API?
API(Application Programming Interface,应用程序编程接口)是一些预先定义的函数,目的是让应用程序与开发人员能够基于某软件或硬件访问一组例程,而无需访问源码,也无需理解其内部工作机制的细节。
例如:
http://apis.juhe.cn/ip/ip2addr?ip=112.112.11.11&key=appkey
返回的json格式的数据是:
{
"resultcode":"200",
"reason":"Return Successd!",
"result":{
"area":"江苏省苏州市",
"location":"电信"
}
}
返回的xml格式的数据是:
<?xml version="1.0" encoding="utf-8" ?>
<root>
<resultcode>200</resultcode>
<reason>Return Successd!</reason>
<result>
<area>江苏省苏州市</area>
<location>电信</location>
</result>
</root>
二、使用Python调用API
1、使用python对json格式的数据解析
import json
# Sample JSON document: two arrays of small objects.
jsonString = '{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
# Parse the text into nested Python dicts/lists.
jsonObj = json.loads(jsonString)
nums = jsonObj["arrayOfNums"]
fruits = jsonObj["arrayOfFruits"]
# The whole "arrayOfNums" list.
print(nums)
# Its second element (a dict).
print(nums[1])
# Sum of the "number" fields of the second and third elements.
print(nums[1]["number"] + nums[2]["number"])
# The "fruit" field of the third fruit entry.
print(fruits[2]["fruit"])
2、使用python调用聚合数据中的天气预报API
from urllib import urlencode
import urllib
import json
# Set this to the APPKey you applied for (placeholder value below).
appkey = "XXXXXXXXXXXXXXXXXXXXXXXX"
# Query the weather for a city.
def queryWeather(appkey, m="GET", city="广州", dtype="json"):
    """Request the weather for ``city`` from the Juhe weather endpoint.

    Relies on the Python 2 ``urllib`` names this script already imports
    (``urlencode`` and ``urllib.urlopen``).

    Args:
        appkey: application key sent as the ``key`` parameter.
        m: when equal to "GET" the parameters are appended to the URL as a
            query string; any other value passes them to ``urlopen`` as
            the request data instead.
        city: city name sent as ``cityname``.
        dtype: response format sent as ``dtype`` ("xml" or "json"). The
            body is always parsed with ``json.loads``.

    Returns:
        The ``result`` member of the decoded response when its
        ``error_code`` is 0; otherwise ``None`` after printing a message.
    """
    url = "http://v.juhe.cn/weather/index"
    params = urlencode({
        "cityname": city,  # city to look up
        "key": appkey,     # application APPKEY (shown on the app detail page)
        "dtype": dtype,    # response format, xml or json
    })
    if m == "GET":
        f = urllib.urlopen("%s?%s" % (url, params))
    else:
        f = urllib.urlopen(url, params)
    # Always release the HTTP response, even if reading it fails.
    try:
        content = f.read()
    finally:
        f.close()
    res = json.loads(content)
    if not res:
        print("request api error")
        return None
    if res["error_code"] == 0:
        # Successful request.
        return res["result"]
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("%s:%s" % (res["error_code"], res["reason"]))
    return None
# Query with the default city. queryWeather yields None when the request
# fails, so only dereference the result when one was returned.
weather = queryWeather(appkey, "GET")
print(weather)
if weather is not None:
    print(urllib.unquote(weather.get("sk").get("wind_direction")))
3、使用python调用聚合数据中的查询IP地址API
from urllib import urlopen
import json
def getCountry(ipAddress, appkey):
    """Look up the area for ``ipAddress`` via the Juhe ip2addr endpoint.

    Args:
        ipAddress: IP address string placed in the ``ip`` query parameter.
        appkey: application key placed in the ``key`` query parameter.

    Returns:
        The ``area`` value found under the ``result`` object of the
        decoded JSON response, or ``None`` when ``result`` is missing or
        empty.
    """
    requestUrl = "http://apis.juhe.cn/ip/ip2addr?ip=" + ipAddress + "&key=" + appkey
    handle = urlopen(requestUrl)
    # Always release the HTTP response, even if reading/decoding fails.
    try:
        response = handle.read().decode('utf-8')
    finally:
        handle.close()
    responseJson = json.loads(response)
    # "area" is nested inside "result" in the response payload, not at the
    # top level; tolerate a missing/empty "result".
    result = responseJson.get("result") or {}
    return result.get("area")
# Set this to the APPKey you applied for. Keep it a placeholder in shared
# or published code; a real key must not be committed.
appkey = "XXXXXXXXXXXXXXXXXXXXXXXX"
print(getCountry("61.135.169.121", appkey))
4、使用python 2.x 调用微博API
微博的Python 2.x SDK:
http://github.liaoxuefeng.com/sinaweibopy/
https://github.com/michaelliao/sinaweibopy
安装sdk
pip install sinaweibopy
实例代码
from weibo import APIClient
import webbrowser
## 1. Account information of the personal Weibo application
APP_KEY = 'XXXXXX'
APP_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXX'
CALLBACK_URL = 'http://f.dataguru.cn'
## 2. Request authorization
# 2.1 Build the client and obtain the authorization URL
client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL)
url = client.get_authorize_url()
# print(url)
# https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//f.dataguru.cn&response_type=code&client_id=2337575664
# 2.2
# Open the authorization page; after clicking "agree", the browser is redirected to the callback page configured above (CALLBACK_URL).
# Read the code (it changes every time) from the address bar of the callback page; it is used in the next step to call oauth2/access_token and obtain the access token.
webbrowser.open_new(url)
# http://f.dataguru.cn/?code=6240f86a9c757ef6ea985cd28647f05a
# NOTE(review): this is a sample code captured from one run; since the code changes on every authorization, replace it with the value from your own callback URL.
code = '6240f86a9c757ef6ea985cd28647f05a'
## 3. Obtain authorization
# Fetch the token and its lifetime
r = client.request_access_token(code)
# print(r)
access_token = r.access_token
# print(access_token)
expires_in = r.expires_in
# print(expires_in)
## 4. Set the token for subsequent API requests
client.set_access_token(access_token, expires_in)
## 5. Latest posts of the logged-in user and the (authorized) users they follow: statuses/home_timeline
# https://api.weibo.com/2/statuses/home_timeline.json
timeline = client.statuses.home_timeline.get(count=10)
statuses = timeline['statuses']
# print(statuses[1])
length = len(statuses)
print(length)
# Print a selection of fields for every post
for status in statuses:
    user = status['user']
    print(u'昵称:' + user['screen_name'])
    print(u'简介:' + user['description'])
    print(u'位置:' + user['location'])
    print(u'微博:' + status['text'])
## 6. Latest posts that mention the logged-in user (the "@me" list): statuses/mentions
# https://api.weibo.com/2/statuses/mentions.json
mentions = client.statuses.mentions.get()
statuses = mentions['statuses']
# print(statuses[1])
length = len(statuses)
print(length)
# Print a selection of fields for every post
for status in statuses:
    user = status['user']
    print(u'昵称:' + user['screen_name'])
    print(u'简介:' + user['description'])
    print(u'位置:' + user['location'])
    print(u'微博:' + status['text'])
    print(u'时间:' + status['created_at'])
5、使用python 3.x 调用微博API
微博的Python 3.x SDK:
https://github.com/nooperpudd/weibopy
安装sdk
pip install weibopy
实例代码
from weibopy import WeiboOauth2
import webbrowser
## 1. Account information of the personal Weibo application
APP_KEY = 'XXXXXX'
APP_SECRET = 'XXXXXXXXXXXX'
CALLBACK_URL = 'http://f.dataguru.cn'
## 2. Request authorization
# 2.1 Build the OAuth2 helper and obtain the authorization URL
client = WeiboOauth2(APP_KEY, APP_SECRET, CALLBACK_URL)
authorize_url = client.authorize_url
print(authorize_url)
# https://api.weibo.com/oauth2/authorize?redirect_uri=http%3A//f.dataguru.cn&response_type=code&client_id=2337575664
# 2.2
# Open the authorization page; after clicking "agree", the browser is redirected to the callback page configured above (CALLBACK_URL).
# Read the code (it changes every time) from the address bar of the callback page; it is used in the next step to call oauth2/access_token and obtain the access token.
webbrowser.open_new(authorize_url)
# http://f.dataguru.cn/?code=4b156593e9dfdd16279bbcc9eb7817bf
# NOTE(review): sample code from one run; replace it with the value from your own callback URL.
code = '4b156593e9dfdd16279bbcc9eb7817bf'
## 3. Obtain authorization
# Fetch the token and its lifetime
r = client.auth_access(code)
# print(r)
access_token = r.get("access_token")
# print(access_token)
# r is read with .get() like a mapping above, so read expires_in the same
# way; attribute access (r.expires_in) fails on a dict.
expires_in = r.get("expires_in")
# print(expires_in)
## 4. Create the API client that carries the token for subsequent requests
from weibopy import WeiboClient
client = WeiboClient(access_token)
## 5. Latest posts of the logged-in user and the (authorized) users they follow: statuses/home_timeline
# https://api.weibo.com/2/statuses/home_timeline.json
result = client.get(suffix="statuses/home_timeline.json")
statuses = result.get("statuses")
# print(statuses[0])
length = len(statuses)
print(length)
# Print a selection of fields for every post
for status in statuses:
    user = status['user']
    print(u'昵称:' + user['screen_name'])
    print(u'简介:' + user['description'])
    print(u'位置:' + user['location'])
    print(u'微博:' + status['text'])
## 6. Latest posts that mention the logged-in user (the "@me" list): statuses/mentions
# https://api.weibo.com/2/statuses/mentions.json
result = client.get(suffix="statuses/mentions.json")
statuses = result.get("statuses")
# print(statuses[0])
length = len(statuses)
print(length)
# Print a selection of fields for every post
for status in statuses:
    user = status['user']
    print(u'昵称:' + user['screen_name'])
    print(u'简介:' + user['description'])
    print(u'位置:' + user['location'])
    print(u'微博:' + status['text'])
    print(u'时间:' + status['created_at'])