Web Crawlers

Recent plan: a series of posts on crawlers, part 1

2017-04-12  nonoBoy

1. A Sina Weibo crawler written from scratch (cookie-based login):

1.1 Step one: a single-threaded crawler that starts from one account and breadth-first crawls more user IDs along with their follower counts

import requests
from lxml import etree
import time
import random

headers = {
    'User-Agent': 'Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)',
    'Cookie': 'your cookie here'
}

def getFans(url):
    response = requests.get(url, headers = headers)
    selector = etree.HTML(response.content) #content
    # parse the fan list
    all_fans= selector.xpath("//td[@valign='top']//a")
    all_fans_href= selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")

    for i in range(int(len(all_fans)/3)):
        # the <a> nodes come in groups of 3 per fan; take the 2nd in each group (index 3*i + 1)
        fan = all_fans[3*i + 1].xpath('string(.)')
        fan_href = all_fans_href[3*i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))

        # append to a local file
        file = open('crawledWeiboIDs02.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()

        # second layer: crawl only the first page of this fan's own fan list
        fan_href = 'https://weibo.cn/' + fan_href[-10:] + '/fans'
        getFans_son(fan_href)
        delay()

def getFans_son(url):
    response = requests.get(url, headers = headers)
    selector = etree.HTML(response.content) #content
    # parse the fan list
    all_fans= selector.xpath("//td[@valign='top']//a")
    all_fans_href= selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")

    for i in range(int(len(all_fans)/3)):
        # the <a> nodes come in groups of 3 per fan; take the 2nd in each group (index 3*i + 1)
        fan = all_fans[3*i + 1].xpath('string(.)')
        fan_href = all_fans_href[3*i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))
        # append to the file
        file = open('crawledWeiboIDs02.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()

def delay():
    seconds = 1.5 + random.random()
    time.sleep(seconds)

# index analysis: 30 <a> nodes -> 0 1 2 | 3 4 5 | 6 7 8 ... take 1, 4, 7, 10 ... -> 3 * i + 1
# crawling 2 layers (1 -> 2 -> 3): (20 pages * 10 per page) * 10 = 2000 users; next layer: 2000 * [(20 pages * 10) * 10] = 4,000,000; keep only users with more than 200 fans
# with 2000 cookies ten-odd minutes could in theory reach ~4M users, but in practice only ~200 pages were crawled per cookie -> ten-odd minutes gives 200 * 1000 (cookies) = 200,000 pages (accounts)
# i.e. about 200,000 monitored users per run; per day that is 6 * 24 * 200k = 28.8M accounts (~29M); reaching 100 million would need about 5000 cookies

for i in range(1, 21): # only 20 pages of fans are shown, so the list is not complete
    url = 'https://weibo.cn/2376442895/fans?page=' + str(i)
    getFans(url)
    delay()

1.2 The loop above finishes in ten-odd minutes and yields about 2,000 users (roughly 1,500 unique after deduplication; a dedup sketch follows the sample data below). The next step is to crawl with several (num=5) cookies in multiple processes, with proxies configured.
Since multiprocessing needs queue management, the URLs to crawl are first pushed into a queue and marked as crawled or not. The code below writes them into the database and uses a status field to track crawl state:

from mongodb_queue import MongoQueue
import re

spider_queue = MongoQueue('sinaWeibo', 'usersInfo')

def start():
    file = open('crawledWeiboIDs02.txt', 'r')

    while True:
        line = file.readline()
        if line:
            result = re.split(',', line)
            # result = [name, url, '粉丝NN人\n']; [2:-2] strips the '粉丝' prefix and the '人\n' tail, keeping only the number
            spider_queue.my_push(result[1], result[0], result[2][2:-2])
        else:
            break
    file.close()

if __name__ == "__main__":
    start()
    # spider_queue.clear()

crawledWeiboIDs02.txt data format:

小可爱Karley,https://weibo.cn/u/6056331817,粉丝16人
通红240,https://weibo.cn/u/6010937618,粉丝63人
一个它正传-,https://weibo.cn/u/5098717649,粉丝27人
小刀羊也疯狂,https://weibo.cn/u/3904193725,粉丝11人
一个咸姐姐,https://weibo.cn/u/2061408941,粉丝315人
蓝颜大海怪,https://weibo.cn/u/2335040577,粉丝300人
....
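Section 1.2 mentions deduplicating these rows down to roughly 1,500 unique users but doesn't show that step. A minimal sketch that keeps the first occurrence of each user URL (the output filename dedupedWeiboIDs02.txt is made up for illustration); note that my_push below already deduplicates as a side effect, since the URL is the MongoDB _id and duplicates raise DuplicateKeyError:

# deduplicate crawledWeiboIDs02.txt by user URL, keeping the first occurrence
seen = set()
with open('crawledWeiboIDs02.txt', 'r') as fin, open('dedupedWeiboIDs02.txt', 'w') as fout:
    for line in fin:
        parts = line.strip().split(',')
        if len(parts) < 3:
            continue
        url = parts[1]
        if url not in seen:
            seen.add(url)
            fout.write(line)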
# MongoDB queue module (mongodb_queue.py)
from datetime import datetime, timedelta
from pymongo import MongoClient, errors

class MongoQueue():

    OUTSTANDING = 1  # initial
    PROCESSING = 2   # downloading
    COMPLETE = 3     # finished

    def __init__(self, db, collection, timeout=300):
        self.client = MongoClient('localhost', 27017)

        self.Client = self.client[db]
        self.db = self.Client[collection]

        self.timeout = timeout

    def __bool__(self):

        record = self.db.find_one({'status':{'$ne':self.COMPLETE}})
        return True if record else False

    def my_push(self, url, name, numOfFans):  # push a new URL into the queue
        try:
            self.db.insert({'_id': url, 'status': self.OUTSTANDING, '微博名称': name, '粉丝数': int(numOfFans)})
            print(url, "插入队列成功")
        except errors.DuplicateKeyError as e: #报错则代表已经存在于队列中
            print(url, "已经存在与队列中")
            pass

    def peek(self):
        record = self.db.find_one({'status': self.OUTSTANDING})
        if record:
            return record['_id']

    def complete(self, url):
        self.db.update({'_id': url}, {'$set':{'status': self.COMPLETE}})

    # custom addition
    def reset(self,url):
        self.db.update({'_id': url}, {'$set':{'status': self.OUTSTANDING}})

    def repair(self):
        record = self.db.find_and_modify(query={'timestamp':{'$lt':datetime.now() - timedelta(seconds=self.timeout)},
                                                'status':{'$ne':self.COMPLETE}
                                                }, update={'$set':{'status':self.OUTSTANDING}})
        if record:
            print('reset URL status:', record['_id'])

    def clear(self):
        self.db.drop()

    # pop: called by the multiprocess crawler below
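    # NOTE: the multiprocess crawler further down calls spider_queue.pop(), which the
    # listing above never defines. The method below is a reconstructed sketch, not the
    # author's original code: it claims one OUTSTANDING record, marks it PROCESSING with
    # a timestamp (so repair() can reset it if it stalls), and raises KeyError when the
    # queue is empty, which the caller catches to stop.
    def pop(self):
        record = self.db.find_and_modify(
            query={'status': self.OUTSTANDING},
            update={'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}}
        )
        if record:
            return record['_id']
        else:
            self.repair()
            raise KeyError('queue is empty')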

# Download module (Download.py)
import requests
import re
import random
import time

# one get() shared by the 5 cookies (passed in as a parameter); the proxy IPs actually need refreshing
class download:
    def __init__(self):

        self.iplist = []
        # option 1: scrape proxy IPs on the fly
        # html = requests.get("http://haoip.cc/tiqu.htm")
        # iplistn =  re.findall(r'r/>(.*?)<b', html.text, re.S)
        # for ip in iplistn:
        #     i = re.sub('\n', '', ip)
        #     self.iplist.append(i.strip())

        # option 2: read proxy IPs from a local file
        file = open('delegateIPs.txt')
        line = file.readline()
        while line:
            self.iplist.append(line.strip())
            line = file.readline()
        file.close()

        self.user_agent_list = [
                    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
                    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)"
                  #...
                ]

    def get(self, url, mycookie, timeout, proxy = None, num_reties = 6):

        UA = random.choice(self.user_agent_list)
        headers = {'User-Agent': UA,
                   'Cookie': mycookie
                   }
        # print(UA + "-------")

        if proxy == None:
            try:
                return requests.get(url, headers=headers, timeout = timeout)
            except:
                if num_reties > 0:
                    time.sleep(10)
                    print('error fetching the page; retrying in 10s, retries left:', num_reties)
                    return self.get(url, mycookie, timeout, num_reties=num_reties - 1)
                else:
                    print('falling back to a proxy')
                    time.sleep(10)
                    IP = ''.join(str(random.choice(self.iplist)).strip())
                    proxy = {'http': IP}
                    return self.get(url, mycookie, timeout, proxy)

        else:
            try:
                IP = ''.join(str(random.choice(self.iplist)).strip())
                # print(IP + "----")
                proxy = {'http': IP}
                response = requests.get(url, headers=headers, proxies= proxy, timeout = timeout)
                return response
            except:
                if num_reties > 0:
                    time.sleep(10)
                    IP = ''.join(str(random.choice(self.iplist)).strip())
                    proxy = {'http': IP}
                    print(IP)
                    print('switching proxy; retrying in 10s, retries left:', num_reties)
                    print('current proxy:', proxy)
                    return  self.get(url, mycookie, timeout, proxy, num_reties -1)
                else:
                    print('the proxies are failing too; dropping the proxy')
                    return self.get(url, mycookie, 3)

request = download()

delegateIPs.txt format:

220.189.191.2:8998
182.38.113.84:808
110.73.1.89:8123
121.232.145.115:9000
118.26.183.215:8080
14.216.162.50:8998
218.104.148.157:8080
58.217.8.244:808
182.99.240.231:9000
......
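The download class above picks an entry from delegateIPs.txt at random but never verifies it, and the notes further down say the proxy list needs refreshing. A minimal health-check sketch; check_proxies and the test URL http://httpbin.org/ip are illustrative additions, not part of the original code:

import requests

def check_proxies(path='delegateIPs.txt', test_url='http://httpbin.org/ip'):
    # keep only the proxies that answer within 5 seconds
    alive = []
    with open(path) as f:
        for line in f:
            ip = line.strip()
            if not ip:
                continue
            try:
                requests.get(test_url, proxies={'http': ip}, timeout=5)
                alive.append(ip)
            except requests.RequestException:
                pass
    return alive

# possible use: overwrite the file with the proxies that still work
# with open('delegateIPs.txt', 'w') as f:
#     f.write('\n'.join(check_proxies()) + '\n')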
# multiprocess crawler code
import requests
from lxml import etree
import multiprocessing
from mongodb_queue import MongoQueue
import time
import random
from Download import request

spider_queue = MongoQueue('sinaWeibo', 'usersInfo')

# build the cookie array; open question: is one IP with several cookies workable? several Weibo accounts on one IP subnet?

mycookie = ['your cookie 1',
            'your cookie 2',
            'your cookie 3',
            'your cookie 4',
            'your cookie 5',
            ]
def process():
    # each worker uses the cookie matching its process name:
    # Process-1/Process-6 -> cookie 1, Process-2/Process-7 -> cookie 2, ..., Process-5/Process-10 -> cookie 5
    name = multiprocessing.current_process().name
    cookie = mycookie[(int(name.split('-')[-1]) - 1) % 5]
    while True:
        try:
            url = spider_queue.pop()
        except KeyError:
            print('no data left in the queue')
            break
        else:
            # build the fans-list link for this user
            fans_url = 'https://weibo.cn/' + url[-10:] + '/fans'
            # crawl it
            getFans(fans_url, cookie)
            delay()
            # mark the URL as done once crawled
            # (no check yet on whether the crawl actually succeeded)
            spider_queue.complete(url)

def getFans(url, mycookie):
    response = request.get(url, mycookie, 3)  # fetch via the Download module (timeout = 3s)
    selector = etree.HTML(response.content) #content

    # parse the fan list
    all_fans= selector.xpath("//td[@valign='top']//a")
    all_fans_href= selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")

    for i in range(int(len(all_fans)/3)):
        # the <a> nodes come in groups of 3 per fan; take the 2nd in each group (index 3*i + 1)
        fan = all_fans[3*i + 1].xpath('string(.)')
        fan_href = all_fans_href[3*i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))

        file = open('crawledWeiboIDs03.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()

        # crawl only the first page of this fan's own fan list
        fan_href = 'https://weibo.cn/' + fan_href[-10:] + '/fans'
        # print(fan_href)

        getFans_son(fan_href, mycookie)
        delay()

def getFans_son(url, mycookie):
    response = request.get(url, mycookie, 3)
    selector = etree.HTML(response.content) #content

    # parse the fan list
    all_fans= selector.xpath("//td[@valign='top']//a")
    all_fans_href= selector.xpath("//td[@valign='top']//a/@href")
    fans_num = selector.xpath("//td[@valign='top']//br/following-sibling::text()")

    for i in range(int(len(all_fans)/3)):
        # the <a> nodes come in groups of 3 per fan; take the 2nd in each group (index 3*i + 1)
        fan = all_fans[3*i + 1].xpath('string(.)')
        fan_href = all_fans_href[3*i + 1]
        numOfFans = fans_num[i]
        print(fan + ', ' + fan_href + ', ' + str(numOfFans))
        # append to the file
        file = open('crawledWeiboIDs03.txt', 'a')
        file.write(fan + ',' + fan_href + ',' + str(numOfFans) + '\n')
        file.close()

def delay():
    delay = 1.5 + random.random()
    time.sleep(delay)

if __name__ == '__main__':
    for i in range(10):
        p = multiprocessing.Process(target=process)
        p.start()
        # print(p.name)

#need a fresh batch of proxy IPs
#handle URLs that don't end in a 10-digit UID, e.g. 美女淘宝购物达人_娇楠宇, https://weibo.cn/mvdrqny, 粉丝695人
#optimization idea: switch VPNs automatically
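One way to handle the non-10-digit case noted above: instead of slicing the last 10 characters of the URL, take whatever follows weibo.cn/ and drop a leading u/. fans_url_from is a hypothetical helper, and it assumes the /fans suffix also works for custom addresses like https://weibo.cn/mvdrqny:

def fans_url_from(user_url):
    # works for both https://weibo.cn/u/6056331817 and https://weibo.cn/mvdrqny
    path = user_url.strip().rstrip('/').split('weibo.cn/')[-1]
    if path.startswith('u/'):
        path = path[2:]
    return 'https://weibo.cn/' + path + '/fans'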

With the multiprocess setup, about 50,000 user records were collected;
the format is as follows:

姗Shan姐要开心,https://weibo.cn/u/5937337735,粉丝174人
用户6100420024,https://weibo.cn/u/6100420024,粉丝7人
郝芷云,https://weibo.cn/u/5121432097,粉丝605人
胡霞Jolly海,https://weibo.cn/u/3478857104,粉丝118人
白师2113,https://weibo.cn/u/5577545264,粉丝4572人
每日金股--工号15132,https://weibo.cn/u/5147380263,粉丝14749人
短线策略-刘,https://weibo.cn/u/5975343802,粉丝9337人
批发各种品牌女鞋,https://weibo.cn/u/6169371763,粉丝178人
西泽s君,https://weibo.cn/u/6060133763,粉丝441人
希望是新收获,https://weibo.cn/u/5569402145,粉丝2950人
Abby设计师品牌代购店,https://weibo.cn/u/6126585085,粉丝224人
威武云的快乐生活,https://weibo.cn/u/5976068356,粉丝190人
运城娟姐,https://weibo.cn/u/6171080440,粉丝62人
3045227010egr,https://weibo.cn/u/3045227010,粉丝205人
温柔你慧姐mmmmm,https://weibo.cn/u/5522201497,粉丝71人
ye_lan芍,https://weibo.cn/u/6073609908,粉丝885人
......

1.3 Next step: crawl each user's profile URL to collect detailed information, such as followers and Weibo post content;
1.4 Open problem: handling Weibo's IP bans;
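Nothing in the code above actually detects a ban, so here is one possible starting point for 1.4, sketched under the assumption that a blocked or logged-out request comes back with a non-200 status, an empty body, or a page without the fan-list markup (looks_blocked is a hypothetical helper; the exact symptoms of a Weibo block are an assumption):

from lxml import etree

def looks_blocked(response):
    # heuristic only: no response, a non-200 status, an empty body, or a page
    # without the fan-list <a> nodes suggests the cookie or IP has been blocked
    if response is None or response.status_code != 200 or not response.content:
        return True
    selector = etree.HTML(response.content)
    return len(selector.xpath("//td[@valign='top']//a")) == 0

# possible use inside getFans(): back off instead of parsing an empty page
# response = request.get(url, mycookie, 3)
# if looks_blocked(response):
#     time.sleep(60)
#     return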
