python爬取微博内容-简版
2020-04-10 本文已影响0人
阪本先生_
首先,要爬取的地址是 m.weibo.cn(手机版),而不是网页版。




```python
import requests
from bs4 import BeautifulSoup
import json
from pyquery import PyQuery as pq
from pymongo import MongoClient
# HTTP headers sent with every API request. They target the mobile host
# (m.weibo.cn) and mark the request as an XMLHttpRequest.
headers = {
    "Host": "m.weibo.cn",
    "Referer": "https://m.weibo.cn/u/5088862652",
    # One logical string, split across lines only for readability.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3741.400 QQBrowser/10.5.3863.400"
    ),
    "X-Requested-With": "XMLHttpRequest",
}
# params = {
# 'uid': '5088862652',
# 'luicode': '10000011',
# 'lfid': '231093_-_selffollowed',
# 'type': 'uid',
# 'value': '5088862652',
# 'containerid': '1076035088862652',
# 'since_id': '4485032922577236',}
# base_url = 'https://m.weibo.cn/api/container/getIndex?'
# response = requests.get(url=base_url,headers=headers,params=params).json()
# since_id = response['data']['cardlistInfo']['since_id']
# Module-level MongoDB handles. MongoClient() is called with no arguments,
# so PyMongo's default connection settings apply.
client = MongoClient()
db = client.weibo  # database named "weibo"
collection = db.weibo  # collection named "weibo" inside that database
def get_page(page):
    """Fetch one page of the user's timeline from the m.weibo.cn container API.

    Args:
        page: Despite its name, this is the ``since_id`` pagination cursor
            read from the previous response
            (``['data']['cardlistInfo']['since_id']``). Pass ``0`` (or any
            falsy value) to request the first page; no cursor is sent then.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body cannot be decoded as JSON.
    """
    params = {
        'uid': '5088862652',
        'luicode': '10000011',
        'lfid': '231093_-_selffollowed',
        'type': 'uid',
        'value': '5088862652',
        'containerid': '1076035088862652',
    }
    # Use the argument instead of a module-level ``since_id`` global, so the
    # function also works when it is called from outside the script entry
    # point (where that global does not exist).
    if page:
        params['since_id'] = page
    base_url = 'https://m.weibo.cn/api/container/getIndex?'
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(
        url=base_url, headers=headers, params=params, timeout=10
    )
    return response.json()
def parse_page(json):
    """Yield one summary dict per post found in a decoded API response.

    Args:
        json: Decoded response dict; the cards are read from
            ``json['data']['cards']``. (The parameter name shadows the
            ``json`` module inside this function; it is kept for
            compatibility with existing callers.)

    Yields:
        dict: With keys ``text`` (post text with HTML markup stripped),
        ``id``, ``attitudes`` (like count) and ``comments`` (comment count).
    """
    for card in json['data']['cards']:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an ``mblog`` entry (presumably non-post cards)
            # are skipped; indexing them would raise KeyError and abort the
            # generator, losing every remaining post on the page.
            continue
        yield {
            # PyQuery's .text() drops tags such as <br/> from the post body.
            'text': pq(mblog['text']).text(),
            'id': mblog['id'],
            'attitudes': mblog['attitudes_count'],
            'comments': mblog['comments_count'],
        }
def save_to_mongo(result):
    """Insert one post dict into the module-level ``collection``.

    Args:
        result: The document to store. PyMongo adds an ``_id`` key to it
            in place when the document has none.

    Prints a confirmation line when the insert reports an inserted id.
    """
    # ``Collection.insert`` was deprecated in PyMongo 3.0 and removed in 4.0;
    # ``insert_one`` is its replacement for a single document.
    if collection.insert_one(result).inserted_id:
        print("Mongo写入")
if __name__ == '__main__':
    # Crawl up to four pages of the timeline, following the ``since_id``
    # cursor that each response provides for the next page.
    since_id = 0  # 0 means "no cursor yet": request the first page
    for _ in range(1, 5):
        # Named ``page_data`` rather than ``json`` so the imported ``json``
        # module is not rebound at module level.
        page_data = get_page(since_id)
        print("======")
        for result in parse_page(page_data):
            try:
                print(result)
                save_to_mongo(result)
            except Exception:
                # Best effort per post: report and continue with the next
                # one. ``Exception`` (not a bare ``except``) lets Ctrl-C and
                # SystemExit still stop the crawl.
                print('=' * 10 + "此内容无法显示" + "=" * 10)
        # Cursor for the next page. It is read after this page's posts are
        # handled, so a response without a cursor still gets processed
        # before the loop ends.
        since_id = page_data['data']['cardlistInfo'].get('since_id')
        if not since_id:
            break