资料 | 汇总网页知乎

爬取个人知乎收藏夹内容生成网站

2017-02-24  本文已影响1670人  treelake

效果

电脑效果1 电脑效果2 电脑效果3 手机效果

爬虫

import os
import json
from bs4 import BeautifulSoup
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# 参考 http://stackoverflow.com/questions/27981545/suppress-insecurerequestwarning-unverified-https-request-is-being-made-in-pytho
import requests_cache
requests_cache.install_cache('demo_cache')


# Directory the script switches into (see os.chdir below); the cookie file
# and the output JSON files are opened with relative paths.
Cookie_FilePlace = r'.'
# Headers applied to every request made through the shared session.
Default_Header = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36",
                   'Host': "www.zhihu.com",
                   'Origin': "http://www.zhihu.com",
                   'Pragma': "no-cache",
                   'Referer': "http://www.zhihu.com/"}
Zhihu_URL = 'https://www.zhihu.com'
# Endpoint that login() POSTs the credentials to.
Login_URL = Zhihu_URL + '/login/email'
# Page fetched by isLogin() and getCollectionsList().
Profile_URL = 'https://www.zhihu.com/settings/profile'
# Template for a collection page; filled with the collection id.
Collection_URL = 'https://www.zhihu.com/collection/%d'
# File in which login() stores the session cookies as JSON.
Cookie_Name = 'cookies.json'

os.chdir(Cookie_FilePlace)

# Shared HTTP session used by every function below.
# NOTE(review): the name `r` is also used as the parameter name of login()
# and isLogin().
r = requests.Session()

#--------------------Prepare--------------------------------#
r.headers.update(Default_Header)
# Reuse cookies saved by an earlier successful login, if the file exists.
if os.path.isfile(Cookie_Name):
    with open(Cookie_Name, 'r') as f:
        cookies = json.load(f)
        r.cookies.update(cookies)

def login(r):
    """Prompt for Zhihu credentials, log in through ``r`` and save the cookies.

    Args:
        r: the ``requests.Session`` used for the login POST; its cookie jar
            is what gets written to disk.

    The JSON reply's ``r`` field is treated as a status code: when it equals
    0 the session cookies are dumped to ``Cookie_Name`` so later runs can
    skip the prompt. Any other value only prints the result line.
    """
    # Local import so this block stays self-contained; getpass reads the
    # password without echoing it to the terminal.
    from getpass import getpass

    print('====== zhihu login =====')
    email = input('email: ')
    password = getpass("password: ")
    print('====== logging.... =====')
    data = {'email': email, 'password': password, 'remember_me': 'true'}
    value = r.post(Login_URL, data=data).json()
    # `msg` is read defensively so a reply without it still reports the code.
    print('====== result:', value['r'], '-', value.get('msg', ''))
    if int(value['r']) == 0:
        with open(Cookie_Name, 'w') as f:
            json.dump(r.cookies.get_dict(), f)

def isLogin(r):
    """Report whether session ``r`` is currently logged in.

    The profile settings page is requested without following redirects.
    A 200 answer counts as logged in; a 301/302 redirect counts as not
    logged in; any other status is reported as a network failure.

    Returns:
        True only for a 200 response, otherwise False.
    """
    response = r.get(Profile_URL, allow_redirects=False, verify=False)
    code = int(response.status_code)
    if code == 200:
        return True
    if code in (301, 302):
        print("未登录")
    else:
        print(u"网络故障")
    return False
        
# Ask for credentials only when the saved cookies do not give a logged-in session.
if not isLogin(r):
    login(r)
    

#---------------------------------------------------------------------#
url_answer_dict= {}
# Separate mapping from an answer's URL to its text, kept apart from the
# collection data so the back end can serve answers through its own API.
# It is filled by getQaDictListFromOneCollection().

#-----------------------get collections-------------------------------#
def getCollectionsList():
    """Collect the title and URL of every collection of the logged-in user.

    The user's home URL is read from the ``#js-url-preview`` element of the
    profile settings page. The ``/collections`` listing is then walked page
    by page, starting at page 1, until a page reports no ``favlists``.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts, where ``url`` is
        ``Collection_URL`` filled with the collection id.
    """
    profile_soup = BeautifulSoup(r.get(Profile_URL).content, 'lxml')
    listing_url = 'http://' + profile_soup.select('#js-url-preview')[0].text + '/collections'

    found = []
    page_num = 1
    while True:
        page_url = listing_url + '?page=%d' % page_num
        page_soup = BeautifulSoup(r.get(page_url).content, 'lxml')
        # The page embeds its state as JSON in the data-state attribute of #data.
        state = json.loads(page_soup.select_one('#data').attrs['data-state'])
        favlists = state['entities']['favlists'].values()
        if len(favlists) == 0:
            break
        for fav in favlists:
            entry = {}
            entry['title'] = fav['title']
            entry['url'] = Collection_URL % fav['id']
            found.append(entry)
        page_num += 1

    print('====== prepare Collections Done =====')
    return found

#-------------------------
def getQaDictListFromOneCollection(collection_url = 'https://www.zhihu.com/collection/71534108'):
    """Scrape every question/answer entry of one collection.

    Pages are requested as ``collection_url?page=N`` starting at N=1 and
    the walk ends at the first page without any ``.zm-item-title a`` link.

    Side effect: for every entry, the answer text is stored in the global
    ``url_answer_dict`` under the answer URL without its leading character.

    Returns:
        A list of dicts with the keys ``title``, ``question_url``,
        ``answer_vote``, ``answer_url``, ``author``, ``author_url`` and
        ``author_img``.

    NOTE(review): the five element lists are combined with ``zip``, which
    stops at the shortest one; if a page yields unequal counts (for example
    an entry without an author link) rows may be dropped or misaligned.
    Confirm against the real page structure.
    """
    entries = []
    page_num = 1
    while True:
        page = r.get(collection_url + '?page=%d' % page_num).content
        soup = BeautifulSoup(page, 'lxml')
        titles = soup.select('.zm-item-title a')
        if not titles:
            break
        rows = zip(
            titles,
            soup.select('.js-vote-count'),
            soup.select('.toggle-expand'),
            soup.select('textarea'),
            soup.select('.author-link-line .author-link'),
        )
        for title, vote, answer_link, answer, author in rows:
            # One extra request per entry: the avatar lives on the author's page.
            avatar = getAthorImage(author['href'])
            entry = {
                'title': title.text,
                'question_url': title['href'],
                'answer_vote': vote.text,
                'answer_url': answer_link['href'],
                'author': author.text,
                'author_url': author['href'],
                'author_img': avatar,
            }
            entries.append(entry)
            answer_key = answer_link['href'][1:]
            url_answer_dict[answer_key] = answer.text
        page_num += 1
    return entries

def getAthorImage(author_url):
    """Return the avatar image URL found on an author's page.

    Args:
        author_url: site-relative path of the author page; it is appended
            to ``Zhihu_URL``.

    Returns:
        The ``src`` attribute of the first ``.AuthorInfo-avatar`` element,
        or an empty string when the page has no such element or the element
        has no ``src``.
    """
    url = Zhihu_URL + author_url
    content = r.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    avatar = soup.select_one('.AuthorInfo-avatar')
    # select_one() returns None when nothing matches; subscripting it would
    # raise TypeError and abort the whole crawl for one missing avatar.
    if avatar is None:
        return ''
    return avatar.get('src', '')

def getAllQaDictList():
    """Crawl every collection and return the combined result.

    The result is a list of dicts nested with lists so the front end can
    parse it directly: one ``{'ctitle': <collection title>, 'clist':
    <entries>}`` dict per collection. A progress line is printed after each
    collection.
    """
    result = []
    for entry in getCollectionsList():
        ctitle = entry['title']
        clist = getQaDictListFromOneCollection(entry['url'])
        result.append({'ctitle': ctitle, 'clist': clist})
        print('====== getQa from %s Done =====' % entry['title'])
    return result


# Crawl first and open the output file afterwards: opening with 'w'
# truncates the file immediately, so crawling inside the `with` block would
# wipe the previous output whenever the crawl fails part-way.
all_qa_dict_list = getAllQaDictList()
with open(u'知乎收藏文章.json', 'w', encoding='utf-8') as f:
    json.dump(all_qa_dict_list, f)

# url_answer_dict was filled as a side effect of the crawl above.
with open(u'url_answer.json', 'w', encoding='utf-8') as f:
    json.dump(url_answer_dict, f)
#---------------------utils------------------------------#
# with open('1.html', 'w', encoding='utf-8') as f:
    # f.write(soup.prettify())
# import os
# Cookie_FilePlace = r'.'
# os.chdir(Cookie_FilePlace)
# import json
# dict_ = {}
# with open(u'知乎收藏文章.json', 'r', encoding='utf-8') as f:
#     dict_ = json.load(f)

前端

<!DOCTYPE html>
<html lang="zh-CN">

<!--view-source:http://v3.bootcss.com/examples/jumbotron-narrow/#-->

<head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>知乎个人收藏</title>
    <link rel="stylesheet" href="https://cdn.bootcss.com/bootstrap/3.3.7/css/bootstrap.min.css">
    <link rel="stylesheet" href="http://v3.bootcss.com/examples/jumbotron-narrow/jumbotron-narrow.css">
    <link rel="stylesheet" type="text/css" href="http://unpkg.com/iview/dist/styles/iview.css">
</head>

<body>
    <div id="app">
        <div class="container">
            <div class="header clearfix">
                <h3 class="text-muted">知乎个人收藏</h3>
            </div>
            <div class="jumbotron">
                <h1>栏目总览</h1>
                <p class="lead">{{ description }}</p>
                <my-carousel></my-carousel>
            </div>
            <!-- Two columns of collection cards; `left` and `right` are filled by the root instance. -->
            <div class="row marketing">
                <div class="col-lg-6">
                    <my-card :collection="collection" v-for="collection in left"></my-card>
                </div>
                <div class="col-lg-6">
                    <my-card :collection="collection" v-for="collection in right"></my-card>
                </div>
            </div>
            <i-button @click="showLeave" style="" long>That's all!</i-button>
            <!-- Shared modal: shows either the farewell message or an answer's HTML. -->
            <Modal :visible.sync="visible" :title="modalTitle"> {{ modalMessage }}
                <div v-html="rawHtml" id="inner-content"></div>
            </Modal>

            <footer class="footer">
                <p>&copy; 2017 treelake.</p>
            </footer>
        </div>
        <!-- /container -->
    </div>

    <script type="text/javascript" src="http://v1.vuejs.org/js/vue.min.js"></script>
    <script src="https://cdn.jsdelivr.net/vue.resource/1.2.0/vue-resource.min.js"></script>
    <script type="text/javascript" src="http://unpkg.com/iview/dist/iview.min.js"></script>

    <script>
        // Static image carousel shown inside the jumbotron.
        // The slides are <img> elements so the `.jumbotron img` rule can size them.
        Vue.component('my-carousel', {
            template: '<div class="showimage"><Carousel arrow="never" autoplay><Carousel-item><img src="https://n2-s.mafengwo.net/fl_progressive,q_mini/s10/M00/74/B6/wKgBZ1irpQ-Afw_uAAepw3nE8w884.jpeg"></Carousel-item><Carousel-item><img src="https://c4-q.mafengwo.net/s10/M00/21/50/wKgBZ1imrvqAafuJAAeRHcfhBBg66.jpeg?imageMogr2%2Finterlace%2F1"></Carousel-item></Carousel></div>'
        })


        // List of answers for one collection. Each row shows the author's
        // avatar with the vote count as a badge, a link to the answer and an
        // icon link to the question.
        // The rows are filtered with `limitBy limitNum limitFrom`
        // (presumably item count and start offset in Vue 1.x; confirm
        // against the Vue 1 filter documentation).
        Vue.component('my-ul', {
            template: '<ul id="list"><li v-for="item in items | limitBy limitNum limitFrom"><Badge :count="item.answer_vote" overflow-count="9999">     <a @click="simpleContent(item)" class="author-badge" :style="{ background: \'url(\'+ item.author_img +\') no-repeat\', backgroundSize:\'cover\'}"></a></Badge>   <a :href=" \'https://www.zhihu.com\' + item.answer_url" target="_blank" style="font-size: 10px">  &nbsp&nbsp&nbsp {{ item.title }}</a><a :href=" \'https://www.zhihu.com\' + item.question_url" target="_blank"><Icon type="chatbubbles"></Icon></a><hr>   </li></ul>',
            props: ['items'],
            methods: {
                // Advance the window by limitNum items, wrapping back to 0
                // once the offset passes the last full window or lands
                // exactly on the end of the list.
                changeLimit() {
                    if (this.limitFrom > this.items.length - this.limitNum) {
                        this.limitFrom = 0;
                    } else {
                        this.limitFrom += this.limitNum;
                    }
                    if (this.limitFrom == this.items.length) {
                        this.limitFrom = 0
                    }
                    console.log(this.limitFrom)
                },
                // Forward the clicked item upwards as a 'child-msg' event.
                simpleContent(msg) {
                    this.$dispatch('child-msg', msg)
                    // $dispatch() fires the event, which bubbles up the parent chain
                },
            },
            data() {
                return {
                    limitNum: 5,
                    limitFrom: 0,
                }
            },
            events: {
                // Handles the 'parent-msg' event by showing the next window of items.
                'parent-msg': function () {
                    this.changeLimit()
                }
            },
        })


        // Card for one collection: the title, a "换一换" link (rendered only
        // when the collection has more than 5 entries) and the answer list.
        Vue.component('my-card', {
            template: '<Card style="width:auto; margin-bottom:15px" ><p slot="title"><Icon type="ios-pricetags"></Icon>{{ collection.ctitle }}</p><a v-if="collection.clist.length>5" slot="extra" @click="notify"><Icon type="ios-loop-strong"></Icon>换一换</a>   <my-ul :items="collection.clist"></my-ul>   </Card>',
            props: ['collection'],
            methods: {
                // Tell the nested <my-ul> to show its next window of items.
                notify: function () {
                    this.$broadcast('parent-msg')
                    // $broadcast() sends the event down to all descendants
                }
            }
        })

        // Last responses from the back end, kept at script scope
        // (shuju = the collections payload, answer = the last answer payload).
        var shuju, answer;
        // Root instance: loads the collections on startup, splits them into
        // two columns and shows answer contents in the shared modal.
        new Vue({
            el: '#app',
            data: {
                description: '',
                visible: false,
                // ctitle: '',
                allqa: [],
                collection: {
                    'clist': [],
                    'ctitle': '',
                },
                left: [],
                right: [],
                modalMessage: '旧时光回忆完毕!',
                modalTitle: 'Welcome!',
                rawHtml: '<a href="https://treeinlake.github.io"> treelake </a>'
            },
            methods: {
                // Open the modal.
                show() {
                    this.visible = true;
                },
                // Show the farewell message with no answer content.
                showLeave() {
                    this.rawHtml = '';
                    this.modalMessage = '旧时光回忆完毕!';
                    this.show();
                }
            },
            events: {
                // Dispatched by <my-ul> when an author badge is clicked:
                // fetch the answer text and show it in the modal.
                'child-msg': function (msg) {
                    this.$http.jsonp('/find' + msg.answer_url, {}, { // single-file testing: http://localhost:5000/find
                        headers: {},
                        emulateJSON: true
                    }).then(function (response) {
                        // success callback
                        answer = response.data;
                        this.rawHtml = answer.answer;
                    }, function (response) {
                        // error callback
                        console.log(response);
                    });
                    this.modalMessage = '';
                    this.modalTitle = msg.title;
                    this.show();
                }
            },
            ready: function () {
                this.$http.jsonp('/collections', {}, { // single-file testing: http://localhost:5000/collections/
                    headers: {},
                    emulateJSON: true
                }).then(function (response) {
                    // success callback
                    shuju = response.data
                    // `i` and `half` are declared locally so they no longer
                    // leak as implicit globals.
                    var i;
                    for (i in shuju) {
                        this.description += (shuju[i].ctitle + ' ');
                        // console.log(shuju[i])
                    }
                    // this.ctitle = shuju[0].ctitle
                    // this.collection = shuju[0]
                    this.allqa = shuju
                    // The left column gets the larger half.
                    var half = Math.floor(shuju.length / 2) + 1
                    this.left = shuju.slice(0, half)
                    this.right = shuju.slice(half, shuju.length)
                    console.log(this.collection)
                }, function (response) {
                    // error callback
                    console.log(response);
                });
            }
        })
    </script>

    <style>
        /* The answer list rendered by the my-ul component (ul id="list"). */
        #list {
            padding: 10px
        }
        
        #list li {
            margin-bottom: 10px;
            padding-bottom: 10px;
        }
        
        /* Images inside the jumbotron fill its width. */
        .jumbotron img {
            width: 100%;
        }
        
        /* Square avatar tile; the image itself is set as an inline background in the my-ul template. */
        .author-badge {
            width: 38px;
            height: 38px;
            border-radius: 6px;
            display: inline-block;
        }
        
        /* Images inside the answer HTML injected into the modal fill its width. */
        #inner-content img {
            width: 100%;
        }
    </style>
</body>

</html>

后端

# -*- coding: utf-8 -*-
from flask import Flask
import json
from flask_jsonpify import jsonpify


app = Flask(__name__)

# Collections data written by the crawler; served by /collections.
with open(u'知乎收藏文章.json', encoding='utf-8') as f:
    collections = json.load(f)

# Mapping from answer URL to answer text; served by /find/<answer_url>.
with open('url_answer.json', encoding='utf-8') as f:
    qa_dict = json.load(f)

# The front-end page, read once at startup and served at /.
with open('zhihuCollection.html', encoding='utf-8') as f:
    index_html = f.read()


@app.route('/')
def index():
    """Serve the front-end page that was read from disk at startup."""
    return index_html


@app.route('/collections')
def collectionsApi():
    """Return all crawled collections through ``jsonpify``."""
    return jsonpify(collections)


@app.route('/find/<path:answer_url>') # the `path` converter keeps the slashes of the answer URL inside one parameter, see http://flask.pocoo.org/snippets/76/
def answersApi(answer_url):
    """Return the stored text of one answer through ``jsonpify``.

    Args:
        answer_url: key into ``qa_dict``, e.g.
            ``question/<id>/answer/<id>``.

    Responds with HTTP 404 when no answer is stored under that key.
    """
    # Local import: only `Flask` is imported from flask at the top of this script.
    from flask import abort

    try:
        answer = qa_dict[answer_url]
    except KeyError:
        # An unknown key used to surface as an unhandled KeyError (HTTP 500).
        abort(404)
    return jsonpify({'answer': answer})


@app.route('/test')
def test():
    """Debug endpoint: return the whole answer-URL-to-text mapping."""
    return jsonpify(qa_dict)


if __name__ == '__main__':
    # NOTE(review): host='0.0.0.0' makes the server reachable from other
    # machines, including the /test dump; confirm this is intended.
    app.run(host='0.0.0.0')

上一篇下一篇

猜你喜欢

热点阅读