08 How to Download All Your Weibo Favorites with a Crawler
2018-08-15
夏威夷的芒果
First, turn the cookie string copied out of your browser into the list-of-dicts form that Selenium expects, and save the result into xcookie.py:
# xcookie.py template download: https://video.mugglecode.com/xcookie.py
c = '''fcc0; TC002'''  # replace this with your own cookie string
cookie_list = []
for one in c.split('; '):
    k, v = one.split('=', 1)
    cookie_list.append({'name': k, 'value': v})
print(cookie_list)
The main program
Each cookie is expressed as a dict and registered with add_cookie(); a minimal sketch of that pattern follows, and the complete crawler comes after it.
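A minimal sketch of the add_cookie() pattern, assuming a running Chrome driver and the cookie_list built above (the full program below does the same thing inside check_cookie()):

from selenium.webdriver import Chrome
from xcookie import cookie_list

driver = Chrome(executable_path='./chromedriver')
driver.get('https://www.weibo.com')   # visit the domain before setting its cookies
driver.delete_all_cookies()
for c in cookie_list:
    driver.add_cookie(c)              # each c is a dict like {'name': ..., 'value': ...}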
# The xcookie.py template used below can be downloaded here: https://video.mugglecode.com/xcookie.py
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import time
import sys

class Spider:  # the crawler object
    def __init__(self, index_url):
        self.index_url = index_url
        self.raw_htmls = []
        self.boot()

    def boot(self):
        self.chrome = Chrome(executable_path='./chromedriver')
        self.chrome.start_client()
        self.check_cookie()

    def check_cookie(self):
        from xcookie import cookie_list  # import the cookies from xcookie.py
        if cookie_list:
            self.chrome.get(self.index_url)  # open the index url first
            time.sleep(5)
            self.chrome.delete_all_cookies()  # delete all existing cookies
            print('Clear!')
            for c in cookie_list:
                self.chrome.add_cookie(c)  # add the cookies one by one
            print('Done')
        else:
            print('pls add cookie first')
            sys.exit()

    def crawl(self, target_url):
        self.chrome.get(target_url)
        print('Wait for web page loading')
        time.sleep(2)
        self.raw_htmls.append(self.chrome.page_source)  # grab the raw pages first, parse later

class Parser:
    def __init__(self, html_list):
        self.html_list = html_list
        self.raw_posts = []
        self.parse()

    def parse(self):
        for html in self.html_list:
            soup = BeautifulSoup(html, 'html.parser')
            detail_sel = '.WB_detail'  # selector for each Weibo post
            detail_els = soup.select(detail_sel)
            for detail in detail_els:
                content = detail.get_text()  # get_text() also collects the text of nested tags
                clean_text = content.replace(' ', '').replace('\n', '')
                self.raw_posts.append(clean_text)
        print(self.raw_posts)

    def save_into_text(self):
        with open('./fav.txt', 'a+', encoding='utf-8') as f:  # utf-8 so Chinese text saves cleanly
            for i in self.raw_posts:
                f.write(i)
                f.write('\n')
                f.write('---' * 10)
                f.write('\n')

s = Spider(index_url='https://www.weibo.com')
s.crawl(target_url='https://www.weibo.com/fav')
p = Parser(s.raw_htmls)
p.save_into_text()
time.sleep(9999)  # keep the browser window open
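As written, the script fetches only the first page of favorites. To cover the whole list, call crawl() once per page before parsing. The page query parameter and the page count below are assumptions about how Weibo paginates the favorites list; adjust them to whatever the address bar shows when you click through the pages.

s = Spider(index_url='https://www.weibo.com')
for page in range(1, 11):  # assumed number of favorite pages; adjust as needed
    s.crawl(target_url='https://www.weibo.com/fav?page={}'.format(page))
p = Parser(s.raw_htmls)
p.save_into_text()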
Note: on the mobile version of the site you have to scroll with simulated touch gestures rather than mouse actions; a sketch follows below.
from selenium.webdriver.common.touch_actions import TouchActions  # used to simulate touch gestures on mobile pages
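A minimal sketch, assuming Chrome's mobile emulation is what makes the session touch-enabled; the device name, mobile URL, and scroll distance are illustrative, not from the original post:

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.touch_actions import TouchActions
import time

opts = ChromeOptions()
opts.add_experimental_option('mobileEmulation', {'deviceName': 'iPhone X'})  # touch-enabled session
driver = Chrome(executable_path='./chromedriver', options=opts)

driver.get('https://m.weibo.cn')               # mobile Weibo page (illustrative)
time.sleep(5)
TouchActions(driver).scroll(0, 800).perform()  # swipe up ~800 px to load more posts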