python爬取微博内容-简版

2020-04-10 本文已影响0人阪本先生_
首先，要爬取的地址是 m.weibo.cn——这是微博的手机版（移动端）站点，而不是电脑网页版。
1.jpg
2.jpg
3.jpg
4.jpg
'''
import requests
from bs4 import BeautifulSoup
import json
from pyquery import PyQuery as pq
from pymongo import MongoClient

# HTTP request headers used for requests to the m.weibo.cn JSON endpoint.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/5088862652',
    # The browser User-Agent is split across lines with implicit
    # string-literal concatenation; the resulting value is one single string.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3741.400 QQBrowser/10.5.3863.400'
    ),
    # Identifies the request as an XMLHttpRequest (AJAX-style) call.
    'X-Requested-With': 'XMLHttpRequest',
}
# params = {
#         'uid': '5088862652',
#         'luicode': '10000011',
#         'lfid': '231093_-_selffollowed',
#         'type': 'uid',
#         'value': '5088862652',
#         'containerid': '1076035088862652',
#         'since_id': '4485032922577236',}
# base_url = 'https://m.weibo.cn/api/container/getIndex?'
# response = requests.get(url=base_url,headers=headers,params=params).json()
# since_id = response['data']['cardlistInfo']['since_id']

# MongoDB handles shared by the whole script. MongoClient() is created with
# no arguments, i.e. with the driver's default connection settings.
client = MongoClient()
# Database named 'weibo' (attribute access is equivalent to client['weibo']).
db = client.weibo
# Collection named 'weibo' inside that database; parsed posts are stored here.
collection = db.weibo



def get_page(page):
    """Fetch one page of the user's timeline from the m.weibo.cn container API.

    Args:
        page: Pagination cursor, i.e. the ``since_id`` value taken from the
            previous response. Pass ``0`` (or any falsy value) to request
            the first page.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    params = {
        'uid': '5088862652',
        'luicode': '10000011',
        'lfid': '231093_-_selffollowed',
        'type': 'uid',
        'value': '5088862652',
        'containerid': '1076035088862652',
    }
    # Use the argument as the cursor instead of a module-level ``since_id``
    # global, so the function also works when imported and called directly.
    if page:
        params['since_id'] = page
    base_url = 'https://m.weibo.cn/api/container/getIndex?'
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(
        url=base_url,
        headers=headers,
        params=params,
        timeout=10,
    )
    # Fail loudly on HTTP errors rather than parsing an error page as JSON.
    response.raise_for_status()
    return response.json()

def parse_page(json):
    """Yield a simplified record for every post card in a timeline response.

    Args:
        json: Decoded response dict whose ``['data']['cards']`` entry is a
            list of card dicts. The parameter name shadows the ``json``
            module inside this function; it is kept so existing callers
            (including keyword callers) continue to work.

    Yields:
        dict: One record per card that carries an ``mblog`` entry, with keys
        ``text`` (post text with HTML markup stripped), ``id``,
        ``attitudes`` (like count) and ``comments`` (comment count).
    """
    # Tolerate a response with no 'data' / 'cards' instead of raising.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without a post payload (presumably non-post cards) would
            # otherwise raise KeyError and abort the whole generator.
            continue
        yield {
            # PyQuery's .text() drops tags such as <br/> from the post body.
            'text': pq(mblog['text']).text(),
            'id': mblog['id'],
            'attitudes': mblog['attitudes_count'],  # number of likes
            'comments': mblog['comments_count'],  # number of comments
        }

def save_to_mongo(result):
    """Insert one parsed post into the module-level MongoDB ``collection``.

    Args:
        result: The document (a dict) to store.

    Prints a confirmation line once the document has been inserted.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document API.
    outcome = collection.insert_one(result)
    if outcome.inserted_id is not None:
        print("Mongo写入")
if __name__ == '__main__':
    # Pagination cursor; 0 requests the first page.
    since_id = 0
    # Fetch at most four pages.
    for _ in range(1, 5):
        # Stored under its own name so the imported ``json`` module is not
        # overwritten at module level.
        page_data = get_page(since_id)
        # Cursor for the next page; absent when there is nothing more to fetch.
        since_id = page_data['data']['cardlistInfo'].get('since_id')
        print("======")

        for result in parse_page(page_data):
            try:
                print(result)
                save_to_mongo(result)
            except Exception as exc:
                # Narrower than a bare ``except:`` so Ctrl-C still stops the
                # run, and the cause is reported rather than hidden.
                print('=' * 10 + "此内容无法显示" + "=" * 10, exc)

        if not since_id:
            # No further cursor was returned: stop instead of re-requesting.
            break
python爬取微博内容-简版

猜你喜欢

热点阅读