Python 知识星球评论爬取 小demo
2019-01-30 本文已影响39人
Spareribs
cookie 已被我用 * 号隐去,换成自己的 cookie 后即可运行
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time : 2019/1/30 19:53
# @Author : Spareribs
# @File : 知识星球.py
# @Software : PyCharm
# @Description :
"""
import json
import re
import urllib
import requests
# HTTP request headers sent with every API call below.
# The values are passed to `requests` unchanged.
headers = {
    "accept": "application/json, text/plain, */*",
    "origin": "https://wx.zsxq.com",
    "x-version": "1.10.14",
    # Adjacent string literals are concatenated into one user-agent value.
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/71.0.3578.98 Safari/537.36"
    ),
    "x-request-id": "73d45154-0e02-14c7-245c-cbe1cc083ee9",
    "referer": "https://wx.zsxq.com/dweb/",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en,zh-CN;q=0.9,zh;q=0.8,zh-TW;q=0.7",
    # Masked placeholder; replace with a real session cookie before running.
    "cookie": "***********************",
}
def get_likes(headers):
    """Print the name of every user who liked the hard-coded topic.

    Requests up to 30 likes (``count=30``) from the topic's ``likes``
    endpoint and prints ``owner.name`` for each entry, one per line.

    :param headers: dict of HTTP headers sent with the request.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    :raises ValueError: when the response body is not valid JSON.
    """
    url = "https://api.zsxq.com/v1.10/topics/544455541888154/likes"
    querystring = {"count": "30"}
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, params=querystring, timeout=10)
    res_dict = json.loads(response.text)
    # "resp_data" may be missing from the payload (presumably when the API
    # rejects the request, e.g. bad cookie), so fall back to empty containers
    # instead of calling .get() on None.
    likes = (res_dict.get("resp_data") or {}).get("likes") or []
    for _like in likes:
        # A like without an "owner" prints None rather than crashing the loop.
        print((_like.get("owner") or {}).get("name"))
def get_topics(headers):
    """Fetch the hard-coded topic and pretty-print its JSON payload.

    The response is parsed and re-serialised with a 4-space indent so it is
    readable on the console.

    :param headers: dict of HTTP headers sent with the request.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    :raises ValueError: when the response body is not valid JSON.
    """
    url = "https://api.zsxq.com/v1.10/topics/544455541888154"
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # response.text is already decoded text. Calling .decode("utf-8") on it
    # raises AttributeError on Python 3 and triggers an implicit ASCII encode
    # (UnicodeEncodeError on non-ASCII content) on Python 2, so parse it
    # directly.
    print(json.dumps(json.loads(response.text), indent=4, separators=(',', ': ')))
def get_comments(headers):
    """Print student number, check-in time and link for each topic comment.

    Requests up to 30 comments in ascending order from the hard-coded
    topic's ``comments`` endpoint. For every comment it extracts:

    * the number at the start of the text before the first ``<``,
    * the first ``href="..."`` value inside the first tag, percent-decoded
      twice,

    and prints them together with the comment's ``create_time``.
    Comments that lack a leading number or an href are skipped instead of
    aborting the whole run.

    NOTE(review): the expected comment shape (``<number> <tag href="...">``)
    is inferred from the parsing logic only; confirm against real API data.

    :param headers: dict of HTTP headers sent with the request.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    :raises ValueError: when the response body is not valid JSON.
    """
    # unquote lives in urllib.parse on Python 3 and in urllib on Python 2.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote

    # Raw strings: "\d" in a plain literal is an invalid escape sequence.
    # Compiled once, outside the loop.
    id_pattern = re.compile(r"[1-9]\d*")
    href_pattern = re.compile(r'href="(.*?)"')

    url = "https://api.zsxq.com/v1.10/topics/544455541888154/comments"
    querystring = {"count": "30", "sort": "asc"}
    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, params=querystring, timeout=10)
    res_dict = json.loads(response.text)
    # "resp_data" may be missing from the payload (presumably when the API
    # rejects the request), so fall back to empty containers.
    comments = (res_dict.get("resp_data") or {}).get("comments") or []

    for _comment in comments:
        _create_time = _comment.get("create_time")
        _text = _comment.get("text") or ""
        if not isinstance(_text, str):
            # Python 2 only: JSON strings arrive as `unicode`; encode them so
            # they can be formatted into the UTF-8 byte-string template below.
            _text = _text.encode("utf-8")

        # Text before the first "<" should start with the student number;
        # the text between the first and second "<" is the first tag.
        leading, _, remainder = _text.partition("<")
        first_tag = remainder.split("<")[0]

        id_match = id_pattern.match(leading)
        hrefs = href_pattern.findall(first_tag)
        if id_match is None or not hrefs:
            # Not a well-formed check-in comment; skip it.
            continue

        _id = id_match.group()
        # The href is decoded twice, as the original code did (presumably the
        # API returns it double percent-encoded; verify against real data).
        _url = unquote(unquote(hrefs[0]))
        print("学号: {0} 打卡时间:{1} 链接:{2}".format(_id, _create_time, _url))
if __name__ == "__main__":
    # Script entry point: only the comment listing is run by default.
    # The two calls below are kept disabled, as in the original script.
    # get_likes(headers)
    # get_topics(headers)
    get_comments(headers)
输出的结果:
学号: 11 打卡时间:2019-01-26T19:22:46.579+0800 链接:https://mp.csdn.net/mdeditor/86660060#
学号: 38 打卡时间:2019-01-26T19:27:19.441+0800 链接:https://blog.csdn.net/ArmanAbdu/article/details/86658591
学号: 15 打卡时间:2019-01-26T20:08:30.696+0800 链接:https://blog.csdn.net/m0_38019841/article/details/86660438
学号: 13 打卡时间:2019-01-26T20:27:05.194+0800 链接:https://blog.csdn.net/kyolxs/article/details/86660659
学号: 24 打卡时间:2019-01-26T20:47:38.126+0800 链接:https://blog.csdn.net/weixin_43955166/article/details/86660044
学号: 10 打卡时间:2019-01-26T21:05:22.195+0800 链接:https://blog.csdn.net/baidu_36697353/article/details/86660909
学号: 30 打卡时间:2019-01-26T21:18:37.292+0800 链接:https://blog.csdn.net/qq_35547281/article/details/86660702
学号: 16 打卡时间:2019-01-26T21:32:31.842+0800 链接:https://blog.csdn.net/weixin_43569867/article/details/86661356
学号: 34 打卡时间:2019-01-26T21:45:00.445+0800 链接:https://blog.csdn.net/duffon_ze/article/details/86661478
学号: 41 打卡时间:2019-01-26T21:45:39.040+0800 链接:https://blog.csdn.net/weixin_44412976/article/details/86661520
学号: 42 打卡时间:2019-01-26T21:52:35.368+0800 链接:https://blog.csdn.net/submarineas/article/details/86660028
学号: 14 打卡时间:2019-01-26T22:09:41.117+0800 链接:https://blog.csdn.net/devcy/article/details/86661535
学号: 22 打卡时间:2019-01-26T23:55:29.683+0800 链接:https://shanjin.github.io/2019/01/26/leetcode-001-towsum/
学号: 40 打卡时间:2019-01-27T01:12:44.690+0800 链接:https://blog.csdn.net/xavierzz/article/details/86663117
学号: 26 打卡时间:2019-01-27T01:36:41.447+0800 链接:http://ywtail.github.io/2019/01/26/leetcode-1-两数之和/
学号: 47 打卡时间:2019-01-27T02:09:36.145+0800 链接:https://github.com/allen119/leetcode
学号: 20 打卡时间:2019-01-27T10:02:17.681+0800 链接:https://blog.csdn.net/Better_Y0808/article/details/86662671
学号: 48 打卡时间:2019-01-27T10:07:35.231+0800 链接:https://www.cnblogs.com/statlearning2019/p/10325665.html
学号: 35 打卡时间:2019-01-27T10:23:45.513+0800 链接:https://blog.csdn.net/weixin_43183978/article/details/86663814
学号: 9 打卡时间:2019-01-27T11:05:10.579+0800 链接:https://blog.csdn.net/Icy_D/article/details/86664049
学号: 19 打卡时间:2019-01-27T11:58:59.081+0800 链接:https://blog.csdn.net/lty159753/article/details/86661901
学号: 44 打卡时间:2019-01-27T12:38:31.554+0800 链接:http://zzyydd.com/2019/01/27/LeetCode_01_TwoSum/
学号: 28 打卡时间:2019-01-27T13:35:26.293+0800 链接:https://www.jianshu.com/p/41f677b2f47d
学号: 23 打卡时间:2019-01-27T15:09:09.342+0800 链接:https://blog.csdn.net/weixin_43399785/article/details/86665432
学号: 32 打卡时间:2019-01-27T15:12:42.511+0800 链接:https://blog.csdn.net/qq_33616637/article/details/86666281
学号: 24 打卡时间:2019-01-27T15:22:31.940+0800 链接:https://blog.csdn.net/qq_43701034/article/details/86654846
学号: 17 打卡时间:2019-01-27T15:48:00.008+0800 链接:https://blog.csdn.net/qq_23936173/article/details/86666642
学号: 39 打卡时间:2019-01-27T16:01:55.978+0800 链接:https://www.jianshu.com/p/b505ab32e416
学号: 33 打卡时间:2019-01-27T16:33:28.395+0800 链接:https://blog.csdn.net/weixin_44370010/article/details/86667077
学号: 43 打卡时间:2019-01-27T16:35:08.952+0800 链接:https://blog.csdn.net/qq_34778922/article/details/86663524