Recent plans: a series of articles on web crawlers, part 1
2017-04-12
nonoBoy
1. A Sina Weibo crawler I wrote myself (cookie-based login):
1.1 Step one: a single-threaded crawler that starts from one account and crawls breadth-first to collect more user IDs and follower counts.
import requests
from lxml import etree
import time
import random

headers = {
    'User-Agent': 'Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)',
    'Cookie': 'your cookie'
}
def getFans(url):
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.content)  # note: .content, not .text
    # parse the fan list
    all_fans = selector.xpath("//td[@valign='top']//a")
    all_fans_href = selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")
    for i in range(int(len(all_fans) / 3)):
        # the <a> nodes come in groups of 3 per fan; take only the 2nd one
        fan = all_fans[3 * i + 1].xpath('string(.)')
        fan_href = all_fans_href[3 * i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))
        # write to a local file
        file = open('crawledWeiboIDs02.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()
        # second level: only crawl the first page
        fan_href = 'https://weibo.cn/' + fan_href[-10:] + '/fans'
        getFans_son(fan_href)
        delay()
def getFans_son(url):
    response = requests.get(url, headers=headers)
    selector = etree.HTML(response.content)  # note: .content, not .text
    # parse the fan list
    all_fans = selector.xpath("//td[@valign='top']//a")
    all_fans_href = selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")
    for i in range(int(len(all_fans) / 3)):
        # the <a> nodes come in groups of 3 per fan; take only the 2nd one
        fan = all_fans[3 * i + 1].xpath('string(.)')
        fan_href = all_fans_href[3 * i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))
        # write to file
        file = open('crawledWeiboIDs02.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()
def delay():
    # sleep for a random 1.5-2.5 s between requests
    delay = 1.5 + random.random()
    time.sleep(delay)
# Analysis of the link indices 0 1 2 3 4 5 6 7 8 ...: the wanted ones are 1, 4, 7, 10, ... -> 3 * i + 1
# Crawling 2 levels (1 -> 2 -> 3): (20 pages * 10 fans) * 10 = 2000 users; the next round would be 2000 * [(20 pages * 10) * 10] = 4,000,000; keep only users with more than 200 fans
# With 2000 cookies, 4 million users could in theory be reached in ten-odd minutes, but in practice only about 200 pages were crawled -> in ten-odd minutes, 200 pages * 1000 cookies = 200,000 pages (accounts)
# That is, monitoring about 200,000 accounts per round; in one day that is 6 * 24 * 200,000 = 28,800,000 accounts, i.e. roughly 29 million; reaching 100 million would take about 5000 cookies
for i in range(1, 21):  # the fan list only shows 20 pages, so not all fans are visible
    url = 'https://weibo.cn/2376442895/fans?page=' + str(i)
    getFans(url)
    delay()
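To make the back-of-envelope numbers in the comments above easier to follow, here is a quick arithmetic check. This is my own sketch; the per-round and per-day figures are the author's assumptions, not measurements.

# Sanity check of the estimates in the comments above (all figures are assumptions).
pages_per_user = 20                      # the fan list shows at most 20 pages
fans_per_page = 10                       # roughly 10 fans per page
level1 = pages_per_user * fans_per_page  # 200 users from the seed account
level2 = level1 * fans_per_page          # ~2000 users after one more hop (first page only)
print(level1, level2)                    # 200 2000

accounts_per_round = 200 * 1000          # 200 pages per cookie * 1000 cookies
per_day = 6 * 24 * accounts_per_round    # 6 rounds per hour * 24 hours
print(per_day)                           # 28800000, i.e. ~29 million accounts per day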
1.2 Running the loop above takes ten-odd minutes and yields roughly 2000 users (about 1500 unique users after deduplication). The next step is to crawl with multiple processes using several (num=5) cookies, this time with proxies.
Since multiprocessing requires queue management, the URLs to crawl are first pushed into a queue and flagged according to whether they have been crawled. The code below writes them into the database, using a status field to mark whether each URL has been crawled:
from mongodb_queue import MongoQueue
import re

spider_queue = MongoQueue('sinaWeibo', 'usersInfo')

def start():
    file = open('crawledWeiboIDs02.txt', 'r')
    while True:
        line = file.readline()
        if line:
            result = re.split(',', line)
            # result[2] looks like '粉丝16人\n'; [2:-2] keeps just the number
            spider_queue.my_push(result[1], result[0], result[2][2:-2])
        else:
            break
    file.close()

if __name__ == "__main__":
    start()
    # spider_queue.clear()
Format of crawledWeiboIDs02.txt:
小可爱Karley,https://weibo.cn/u/6056331817,粉丝16人
通红240,https://weibo.cn/u/6010937618,粉丝63人
一个它正传-,https://weibo.cn/u/5098717649,粉丝27人
小刀羊也疯狂,https://weibo.cn/u/3904193725,粉丝11人
一个咸姐姐,https://weibo.cn/u/2061408941,粉丝315人
蓝颜大海怪,https://weibo.cn/u/2335040577,粉丝300人
....
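As a quick check of this format and of the result[2][2:-2] slice used in start() above, here is my own snippet using one line from the sample:

# One sample line from crawledWeiboIDs02.txt, including the trailing newline.
line = '小可爱Karley,https://weibo.cn/u/6056331817,粉丝16人\n'
name, href, fans = line.split(',')
print(fans)         # '粉丝16人\n'
print(fans[2:-2])   # '16' -> drops the leading '粉丝' and the trailing '人\n'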
# MongoDB queue module (mongodb_queue.py)
from datetime import datetime, timedelta
from pymongo import MongoClient, errors

class MongoQueue():
    OUTSTANDING = 1  # initial
    PROCESSING = 2   # downloading
    COMPLETE = 3     # finished

    def __init__(self, db, collection, timeout=300):
        self.client = MongoClient('localhost', 27017)
        self.Client = self.client[db]
        self.db = self.Client[collection]
        self.timeout = timeout

    def __bool__(self):
        record = self.db.find_one({'status': {'$ne': self.COMPLETE}})
        return True if record else False

    def my_push(self, url, name, numOfFans):  # push a new URL into the queue
        try:
            self.db.insert({'_id': url, 'status': self.OUTSTANDING, '微博名称': name, '粉丝数': int(numOfFans)})
            print(url, "pushed into the queue")
        except errors.DuplicateKeyError as e:  # the error means the URL is already in the queue
            print(url, "is already in the queue")
            pass

    def peek(self):
        record = self.db.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        self.db.update({'_id': url}, {'$set': {'status': self.COMPLETE}})

    # my own addition
    def reset(self, url):
        self.db.update({'_id': url}, {'$set': {'status': self.OUTSTANDING}})

    def repair(self):
        record = self.db.find_and_modify(
            query={'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)},
                   'status': {'$ne': self.COMPLETE}},
            update={'$set': {'status': self.OUTSTANDING}})
        if record:
            print('reset URL status', record['_id'])

    def clear(self):
        self.db.drop()
# pop still needs testing — see the sketch below
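The multi-process code further down calls spider_queue.pop() and catches KeyError when the queue is empty, but the class above does not define pop(). Here is a minimal sketch of what it could look like, following the same peek()/repair() pattern; this is my assumption, not the original code. It claims one OUTSTANDING URL, marks it PROCESSING with a timestamp so repair() can later reset stalled records, and raises KeyError when nothing is left.

    # Assumed sketch of the missing pop(), to be added inside MongoQueue:
    def pop(self):
        record = self.db.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record:
            return record['_id']
        else:
            self.repair()      # reset URLs stuck in PROCESSING for too long
            raise KeyError()   # the caller treats this as "the queue is empty"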
# Download module (Download.py)
import requests
import re
import random
import time

# Five get calls map to the 5 cookies; the proxy IPs actually need refreshing as well.
class download:
    def __init__(self):
        self.iplist = []
        # Option 1: scrape proxy IPs on the fly
        # html = requests.get("http://haoip.cc/tiqu.htm")
        # iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        # for ip in iplistn:
        #     i = re.sub('\n', '', ip)
        #     self.iplist.append(i.strip())
        # Option 2: read a fresh batch of proxy IPs from a local file
        file = open('delegateIPs.txt')
        line = file.readline()
        while line:
            self.iplist.append(line)
            line = file.readline()
        file.close()
        self.user_agent_list = [
            "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
            "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)"
            # ...
        ]
    def get(self, url, mycookie, timeout, proxy=None, num_reties=6):
        UA = random.choice(self.user_agent_list)
        headers = {'User-Agent': UA,
                   'Cookie': mycookie
                   }
        # print(UA + "-------")
        if proxy == None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except:
                if num_reties > 0:
                    time.sleep(10)
                    print(u'Failed to fetch the page; retrying in 10 s,', num_reties, u'attempts left')
                    # pass num_reties as a keyword so it is not mistaken for the proxy argument
                    return self.get(url, mycookie, timeout, num_reties=num_reties - 1)
                else:
                    print(u'Switching to a proxy')
                    time.sleep(10)
                    IP = ''.join(str(random.choice(self.iplist)).strip())
                    proxy = {'http': IP}
                    return self.get(url, mycookie, timeout, proxy)
        else:
            try:
                IP = ''.join(str(random.choice(self.iplist)).strip())
                # print(IP + "----")
                proxy = {'http': IP}
                response = requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
                return response
            except:
                if num_reties > 0:
                    time.sleep(10)
                    IP = ''.join(str(random.choice(self.iplist)).strip())
                    proxy = {'http': IP}
                    print(IP)
                    print(u'Switching proxy; retrying in 10 s,', num_reties, u'attempts left')
                    print(u'Current proxy:', proxy)
                    return self.get(url, mycookie, timeout, proxy, num_reties - 1)
                else:
                    print(u'The proxies stopped working too! Dropping the proxy')
                    return self.get(url, mycookie, 3)
request = download()
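For reference, the downloader is called like this by the multi-process crawler below (a 3-second timeout; the cookie string is a placeholder):

# Example call (the cookie value is a placeholder):
response = request.get('https://weibo.cn/2376442895/fans', 'your cookie', 3)
print(response.status_code)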
Format of delegateIPs.txt:
220.189.191.2:8998
182.38.113.84:808
110.73.1.89:8123
121.232.145.115:9000
118.26.183.215:8080
14.216.162.50:8998
218.104.148.157:8080
58.217.8.244:808
182.99.240.231:9000
......
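Each line is picked at random and turned into the proxies dict that download.get() passes to requests. A minimal illustration of that step, using two of the sample entries above:

import random

iplist = ['220.189.191.2:8998\n', '182.38.113.84:808\n']  # lines as read from delegateIPs.txt
IP = str(random.choice(iplist)).strip()                   # drop the trailing newline
proxy = {'http': IP}                                      # requests treats a bare host:port as an HTTP proxy
# requests.get(url, headers=headers, proxies=proxy, timeout=3)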
# Multi-process crawler
import requests
from lxml import etree
import multiprocessing
from mongodb_queue import MongoQueue
import time
import random
from Download import request

spider_queue = MongoQueue('sinaWeibo', 'usersInfo')

# Build the cookie list. Open questions: do several cookies behind one IP work?
# What about several Weibo accounts on one IP subnet?
mycookie = ['your cookie 1',
            'your cookie 2',
            'your cookie 3',
            'your cookie 4',
            'your cookie 5',
            ]
def process():
    while True:
        try:
            url = spider_queue.pop()
        except KeyError:
            print('The queue is empty')
            break
        else:
            # Use the current process name (rather than the global p, which only
            # exists in the child on fork-based platforms); each pair of processes
            # shares one cookie.
            name = multiprocessing.current_process().name
            if name == 'Process-1' or name == 'Process-6':
                # build the fan-list URL
                fans_url = 'https://weibo.cn/' + url[-10:] + '/fans'
                # crawl it
                getFans(fans_url, mycookie[0])
                delay()
                # mark the URL as done once crawled
                # (no check yet on whether the crawl actually succeeded)
                spider_queue.complete(url)
                # print('branch 1')
            if name == 'Process-2' or name == 'Process-7':
                fans_url = 'https://weibo.cn/' + url[-10:] + '/fans'
                getFans(fans_url, mycookie[1])
                delay()
                spider_queue.complete(url)
            if name == 'Process-3' or name == 'Process-8':
                fans_url = 'https://weibo.cn/' + url[-10:] + '/fans'
                getFans(fans_url, mycookie[2])
                delay()
                spider_queue.complete(url)
            if name == 'Process-4' or name == 'Process-9':
                fans_url = 'https://weibo.cn/' + url[-10:] + '/fans'
                getFans(fans_url, mycookie[3])
                delay()
                spider_queue.complete(url)
            if name == 'Process-5' or name == 'Process-10':
                fans_url = 'https://weibo.cn/' + url[-10:] + '/fans'
                getFans(fans_url, mycookie[4])
                delay()
                spider_queue.complete(url)
def getFans(url, mycookie):
    response = request.get(url, mycookie, 3)  # was: requests.get(url, headers=headers)
    selector = etree.HTML(response.content)   # note: .content, not .text
    # parse the fan list
    all_fans = selector.xpath("//td[@valign='top']//a")
    all_fans_href = selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")
    for i in range(int(len(all_fans) / 3)):
        # the <a> nodes come in groups of 3 per fan; take only the 2nd one
        fan = all_fans[3 * i + 1].xpath('string(.)')
        fan_href = all_fans_href[3 * i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))
        file = open('crawledWeiboIDs03.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()
        # only crawl the first page of the next level
        fan_href = 'https://weibo.cn/' + fan_href[-10:] + '/fans'
        # print(fan_href)
        getFans_son(fan_href, mycookie)
        delay()
def getFans_son(url, mycookie):
    response = request.get(url, mycookie, 3)
    selector = etree.HTML(response.content)  # note: .content, not .text
    # parse the fan list
    all_fans = selector.xpath("//td[@valign='top']//a")
    all_fans_href = selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")
    for i in range(int(len(all_fans) / 3)):
        # the <a> nodes come in groups of 3 per fan; take only the 2nd one
        fan = all_fans[3 * i + 1].xpath('string(.)')
        fan_href = all_fans_href[3 * i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))
        # write to file
        file = open('crawledWeiboIDs03.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()
def delay():
    delay = 1.5 + random.random()
    time.sleep(delay)
if __name__ == '__main__':
    for i in range(10):
        p = multiprocessing.Process(target=process)
        p.start()
        # print(p.name)

# TODO: get a fresh batch of proxy IPs
# TODO: handle URLs that are not 10-digit user IDs, e.g. 美女淘宝购物达人_娇楠宇, https://weibo.cn/mvdrqny, 粉丝695人 — see the sketch below
# TODO (optimization): automate switching VPNs
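On the non-10-digit issue noted above: url[-10:] assumes a numeric user ID, so custom profile URLs such as https://weibo.cn/mvdrqny get mangled. A possible fix, as my own sketch (whether the /fans path works for custom names is untested), is to take the last path segment instead of the last 10 characters:

# Sketch: take the last path segment instead of the last 10 characters.
def user_id(url):
    return url.rstrip('/').split('/')[-1]

print(user_id('https://weibo.cn/u/6056331817'))  # 6056331817
print(user_id('https://weibo.cn/mvdrqny'))       # mvdrqny
fans_url = 'https://weibo.cn/' + user_id('https://weibo.cn/mvdrqny') + '/fans'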
With multiple processes running, about 50,000 user records were collected, in the following format:
姗Shan姐要开心,https://weibo.cn/u/5937337735,粉丝174人
用户6100420024,https://weibo.cn/u/6100420024,粉丝7人
郝芷云,https://weibo.cn/u/5121432097,粉丝605人
胡霞Jolly海,https://weibo.cn/u/3478857104,粉丝118人
白师2113,https://weibo.cn/u/5577545264,粉丝4572人
每日金股--工号15132,https://weibo.cn/u/5147380263,粉丝14749人
短线策略-刘,https://weibo.cn/u/5975343802,粉丝9337人
批发各种品牌女鞋,https://weibo.cn/u/6169371763,粉丝178人
西泽s君,https://weibo.cn/u/6060133763,粉丝441人
希望是新收获,https://weibo.cn/u/5569402145,粉丝2950人
Abby设计师品牌代购店,https://weibo.cn/u/6126585085,粉丝224人
威武云的快乐生活,https://weibo.cn/u/5976068356,粉丝190人
运城娟姐,https://weibo.cn/u/6171080440,粉丝62人
3045227010egr,https://weibo.cn/u/3045227010,粉丝205人
温柔你慧姐mmmmm,https://weibo.cn/u/5522201497,粉丝71人
ye_lan芍,https://weibo.cn/u/6073609908,粉丝885人
......
1.3 Next step: crawl each user's profile URL to obtain detailed information, including followers, Weibo posts, and so on.
1.4 Remaining problem: handling Weibo's IP bans.