08 How to Download All Your Weibo Favorites with a Crawler
2018-08-15
夏威夷的芒果
First, turn the cookie string copied out of your browser into the list-of-dicts form that Selenium expects, and save the result into xcookie.py:
# xcookie.py template download: https://video.mugglecode.com/xcookie.py
c = '''fcc0; TC002'''  # replace this with your own cookie string
cookie_list = []
for one in c.split('; '):
    k, v = one.split('=', 1)
    cookie_list.append({'name': k, 'value': v})
print(cookie_list)
The main program
Each cookie is expressed as a dict and registered with add_cookie(); a minimal sketch of that pattern follows, and the complete crawler comes after it.
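A minimal sketch of the add_cookie() pattern, assuming a running Chrome driver and the cookie_list built above (the full program below does the same thing inside check_cookie()):

from selenium.webdriver import Chrome
from xcookie import cookie_list

driver = Chrome(executable_path='./chromedriver')
driver.get('https://www.weibo.com')   # visit the domain before setting its cookies
driver.delete_all_cookies()
for c in cookie_list:
    driver.add_cookie(c)              # each c is a dict like {'name': ..., 'value': ...}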
# The xcookie.py template used below can be downloaded here: https://video.mugglecode.com/xcookie.py
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import time
import sys

class Spider:  # the crawler object
    def __init__(self, index_url):
        self.index_url = index_url
        self.raw_htmls = []
        self.boot()

    def boot(self):
        self.chrome = Chrome(executable_path='./chromedriver')
        self.chrome.start_client()
        self.check_cookie()

    def check_cookie(self):
        from xcookie import cookie_list  # import the cookies from xcookie.py
        if cookie_list:
            self.chrome.get(self.index_url)  # open the index url first
            time.sleep(5)
            self.chrome.delete_all_cookies()  # delete all existing cookies
            print('Clear!')
            for c in cookie_list:
                self.chrome.add_cookie(c)  # add the cookies one by one
            print('Done')
        else:
            print('pls add cookie first')
            sys.exit()

    def crawl(self, target_url):
        self.chrome.get(target_url)
        print('Wait for web page loading')
        time.sleep(2)
        self.raw_htmls.append(self.chrome.page_source)  # grab the raw pages first, parse later

class Parser:
    def __init__(self, html_list):
        self.html_list = html_list
        self.raw_posts = []
        self.parse()

    def parse(self):
        for html in self.html_list:
            soup = BeautifulSoup(html, 'html.parser')
            detail_sel = '.WB_detail'  # selector for each Weibo post
            detail_els = soup.select(detail_sel)
            for detail in detail_els:
                content = detail.get_text()  # get_text() also collects the text of nested tags
                clean_text = content.replace(' ', '').replace('\n', '')
                self.raw_posts.append(clean_text)
        print(self.raw_posts)

    def save_into_text(self):
        with open('./fav.txt', 'a+', encoding='utf-8') as f:  # utf-8 so Chinese text saves cleanly
            for i in self.raw_posts:
                f.write(i)
                f.write('\n')
                f.write('---' * 10)
                f.write('\n')

s = Spider(index_url='https://www.weibo.com')
s.crawl(target_url='https://www.weibo.com/fav')
p = Parser(s.raw_htmls)
p.save_into_text()
time.sleep(9999)  # keep the browser window open
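As written, the script fetches only the first page of favorites. To cover the whole list, call crawl() once per page before parsing. The page query parameter and the page count below are assumptions about how Weibo paginates the favorites list; adjust them to whatever the address bar shows when you click through the pages.

s = Spider(index_url='https://www.weibo.com')
for page in range(1, 11):  # assumed number of favorite pages; adjust as needed
    s.crawl(target_url='https://www.weibo.com/fav?page={}'.format(page))
p = Parser(s.raw_htmls)
p.save_into_text()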
Note: on the mobile version of the site you have to scroll with simulated touch gestures rather than mouse actions; a sketch follows below.
from selenium.webdriver.common.touch_actions import TouchActions  # used to simulate touch gestures on mobile pages
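A minimal sketch, assuming Chrome's mobile emulation is what makes the session touch-enabled; the device name, mobile URL, and scroll distance are illustrative, not from the original post:

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.touch_actions import TouchActions
import time

opts = ChromeOptions()
opts.add_experimental_option('mobileEmulation', {'deviceName': 'iPhone X'})  # touch-enabled session
driver = Chrome(executable_path='./chromedriver', options=opts)

driver.get('https://m.weibo.cn')               # mobile Weibo page (illustrative)
time.sleep(5)
TouchActions(driver).scroll(0, 800).perform()  # swipe up ~800 px to load more posts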