爬取简书推荐作者的粉丝信息并保存到 MongoDB 数据库

2017-08-15  本文已影响17人  半杯故事
import requests,time
from lxml import etree
from pymongo import MongoClient


def requestGet(url, timeout=10):
    """Download ``url`` and return its body parsed as an lxml HTML tree.

    The request is sent with the module-level ``headers`` dict, which is
    created in the ``__main__`` section of this script, so that global must
    exist before this function is called.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The element tree built by ``etree.HTML`` from the response text.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires. The crawl functions in this file catch it
            through their ``except Exception`` handlers.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(response.text)

def get_fens_info(url,fens_count):
    """Crawl an author's follower list and store each follower in MongoDB.

    Pages are requested as ``url + '?page=N'``. Every follower entry found is
    inserted into the module-level ``posts`` collection as a document of the
    form ``{'name': <str>, 'text': <list of str>}``.

    Args:
        url: URL of the author's follower list (without the page parameter).
        fens_count: Number of followers; anything ``int()`` accepts.

    Returns:
        None. Any exception is reported on stdout and ends the crawl for this
        author (best effort); nothing is re-raised.
    """
    # The page arithmetic below assumes 9 followers per list page.
    per_page = 9
    # The site only serves data for the first 100 follower pages.
    max_pages = 100

    try:
        total = int(fens_count)

        # Exclusive upper bound for range(): number of pages needed + 1.
        if total % per_page == 0:
            fens_page = total // per_page + 1
        else:
            fens_page = total // per_page + 2
        print(fens_page)

        # Only request pages that can contain followers, capped at the
        # site's 100-page limit (page 100 itself is included).
        for page in range(1, min(fens_page, max_pages + 1)):
            print(page)
            select = requestGet(url+'?page={}'.format(page))

            infos = select.xpath('//div[@id="list-container"]//div[@class="info"]')
            for info in infos:
                names = info.xpath('a/text()')
                if not names:
                    # Skip an entry without a name instead of aborting the
                    # remaining followers with an IndexError.
                    continue
                name = names[0]
                text = info.xpath('div/text()')

                print(name,text)
                # Store the follower in the collection.
                posts.insert_one({
                    'name' : name,
                    'text' : text
                })
    except Exception as e:
        print('get_fens_info函数解析错误 错误为:',e)


# Fetch the names and profile URLs of Jianshu's recommended authors
def get_recommend_author_name():
    """Walk the recommended-authors listing page by page.

    Starting at page 1, each listing page (``base_url`` + page number) is
    fetched and every ``div.wrap`` entry on it is handed to
    ``get_recommend_author_info`` together with the author's name and
    absolute profile URL. The walk stops at the first page that yields no
    entries. Any exception is printed and ends the walk.
    """
    try:
        current_page = 1
        while True:
            tree = requestGet(base_url + str(current_page))
            author_nodes = tree.xpath('//div[@class="wrap"][position()>0]')
            print(current_page)

            # An empty page means the listing is exhausted.
            if not author_nodes:
                break

            current_page += 1
            for node in author_nodes:
                author_name = node.xpath('a/h4/text()')[0]
                print(author_name)
                profile_url = jianshu + node.xpath('a/@href')[0]
                get_recommend_author_info(author_name, profile_url)

    except Exception as err:
        print("get_recommend_author_name函数解析错误 错误为 ", err)


# Fetch an author's follower count, following count and related numbers
def get_recommend_author_info(name,url):
    """Read the counters on an author's profile and crawl their followers.

    The profile at ``url`` is fetched and its ``div.meta-block`` elements are
    read by position: blocks 0-2 carry a linked count (following, followers,
    articles, going by the original pinyin variable names) and blocks 3-4 a
    plain count (words, likes). The counts are printed, then the follower
    list linked from block 1 is passed to ``get_fens_info``.

    Args:
        name: Author name; accepted but not used inside this function.
        url: Absolute URL of the author's profile page.

    Returns:
        True on success. On any exception the error is printed and the
        function falls through, returning None.
    """
    try:
        meta_blocks = requestGet(url).xpath('//div[@class="meta-block"][position()>0]')

        def first(block_index, path):
            # First XPath match inside the meta-block at the given position.
            return meta_blocks[block_index].xpath(path)[0]

        following = first(0, 'a/p/text()')
        followers = first(1, 'a/p/text()')
        followers_url = jianshu + first(1, 'a/@href')
        articles = first(2, 'a/p/text()')
        words = first(3, 'p/text()')
        likes = first(4, 'p/text()')

        print(following, followers, articles, words, likes)
        get_fens_info(followers_url, followers)

        return True

    except Exception as err:
        print('get_recommend_author_info函数解析错误 错误为:', err)


if __name__ == "__main__":
    # Script entry point: set up the globals the crawl functions read
    # (jianshu, headers, base_url, posts), run the crawl and report the
    # elapsed wall-clock time.
    jianshu = 'http://www.jianshu.com'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    headers = {
        'User-Agent': user_agent
    }

    base_url = 'http://www.jianshu.com/recommendations/users?page='

    start = time.time()

    # Connect to MongoDB on the default host (localhost) and port (27017).
    # A single client is enough; a MongoDB URI such as
    # 'mongodb://localhost:27017' could be passed instead.
    client = MongoClient('localhost', 27017)
    try:
        # Use the "fens_db" database ...
        db = client.fens_db
        # ... and its "posts" collection, which get_fens_info writes to.
        posts = db.posts

        get_recommend_author_name()
        end = time.time()
    finally:
        # Always release the connection, even if the crawl raises.
        client.close()

    print("总耗时 %0.3f" % (end - start))

屏幕快照 2017-08-15 上午9.00.12.png
上一篇下一篇

猜你喜欢

热点阅读