虎牙、B站网页信息python抓取试试

2021-08-24  本文已影响0人  千转军师

利用虎牙和b站的网页来抓取用户及其粉丝数量

使用时要注意:
(1)在 cmd 命令行下运行 python test.py,
后面需要跟 4 个参数;如果没有输入参数,程序会打印用法提示,如:

useage: python test.py <roomIdStart[0:10000000]> <roomIdInterval[0:10000]> <multiProcessNum[0:20]> <loopTimes[0:20]>
param num is less than 5, num is 1

含义依次是:起始房间号、每个进程搜索的房间数(搜索组的大小)、并发进程数、循环次数
搜索的房间总数 = 搜索组的大小 * 进程数 * 循环次数
(2)结果
结果放在文件 data/target.txt
注意事先创建文件夹 data,否则提示没有 data/target.txt
(3)例子(虎牙):

================= new record ==================
time:2022-02-20 10:18:55
roomIdStart:1000
roomIdInterval:10
multiProcessNum:18
loopTimes:2
range[1000:1360]
===============================================
1066 电子厂-心态 7945900 1539218884
1123 奇领颜韵Ycy【万徒】 117598 1199552286636
==========================
总共时18.54秒

##################################
# 每日一抓:  虎牙粉丝排行榜数据 #
##################################
from urllib.request import urlopen
import sys



def create_file(name):
    """Create the file *name*, truncating it to zero length if it already exists."""
    # Opening in "w" mode is all that is needed; nothing is written.
    with open(name, "w"):
        pass

def save_data_to_file(fileName, bufUtf8):
    """Append already-encoded bytes to a file.

    Args:
        fileName: Path of the file to append to; it is created if missing.
        bufUtf8: Bytes to append (callers pass UTF-8 encoded text).
    """
    # The context manager guarantees the handle is flushed and closed;
    # the previous version left the file open after writing.
    with open(fileName, "ab") as out:
        out.write(bufUtf8)



# Scrape the subscriber count from a Huya video page.
# Example page: https://v.huya.com/u/1199553057095

def get_subscribe_num_by_idstr(idstr):
    """Return the subscriber count shown on a Huya video page, as a string.

    Only lines 180..280 of the page are inspected, to save time.

    Args:
        idstr: Id string appended to ``https://v.huya.com/u/``.

    Returns:
        The text between the subscription marker and its closing tags, or
        ``"0"`` when no inspected line carries the marker.  When several
        lines match, the last one wins.

    Raises:
        Exceptions raised by ``urlopen`` or by UTF-8 decoding are not caught.
    """
    videoBaseUrl = 'https://v.huya.com/u/'
    subscribeMark = "                        <span>订阅:<em>"
    subscribeEndMark = "</em></span>"
    firstLine, lastLine = 180, 280
    result = "0"
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(videoBaseUrl + idstr) as response:
        for lineNo, raw in enumerate(response, start=1):
            if lineNo < firstLine:
                continue
            if lineNo > lastLine:
                # Nothing past the window is used, so stop downloading.
                break
            text = raw.decode(encoding="utf-8")
            markPos = text.find(subscribeMark)
            if markPos == -1:
                continue
            # Slice relative to where the marker was actually found rather
            # than assuming it sits at column 0 and the line ends in "\r\n".
            valueStart = markPos + len(subscribeMark)
            valueEnd = text.find(subscribeEndMark, valueStart)
            if valueEnd == -1:
                valueEnd = len(text.rstrip("\r\n"))
            result = text[valueStart:valueEnd]
    return result

# Scrape one Huya room page.
# Example page: https://www.huya.com/298039

def _cut_until(text, start, endMark):
    """Return text[start:] up to the next endMark, or to the end of the line."""
    stop = text.find(endMark, start)
    if stop == -1:
        stop = len(text.rstrip("\r\n"))
    return text[start:stop]


def web_content_pro(url, roomId, fileName):
    """Scrape one room page and append the result to *fileName*.

    Only lines 100..200 of the page are inspected, to save time.  When a
    host name is found, the record
    ``roomId<TAB>anchor<TAB>subscribe<TAB>video`` is printed and appended
    to *fileName* as UTF-8 followed by a newline.  When no host name is
    found, a "not found" message is printed and nothing is written.

    If the page reports a subscriber count of ``'0'`` (or none at all), the
    record is tagged with ``[alarm:subscribe=0]`` and the count is looked up
    again through get_subscribe_num_by_idstr() using the video id.

    Args:
        url: Full URL of the room page.
        roomId: Room id, used only for the output record and messages.
        fileName: File the record is appended to.

    Raises:
        Exceptions raised by ``urlopen`` or by UTF-8 decoding are not caught.
    """
    anchorMark = "<h3 class=\"host-name\" title="
    anchorEndMark = "</h3>"
    subscribeMark = "        <div class=\"subscribe-count\" id=\"activityCount\">"
    subscribeEndMark = "</div>"
    videoMark = "            <a class=\"host-video\" href=\"http://v.huya.com/u/"
    videoEndMark = "\""  # the id ends at the closing quote of the href
    firstLine, lastLine = 100, 200
    # None means "marker not seen yet"; only the first match of each is kept.
    anchor = None
    subscribe = None
    video = None
    # The context manager closes the HTTP response on every exit path.
    with urlopen(url) as response:
        for lineNo, raw in enumerate(response, start=1):
            if lineNo < firstLine:
                continue
            if lineNo > lastLine:
                # Nothing past the window is used, so stop downloading.
                break
            text = raw.decode(encoding="utf-8")

            # Host name: the text after the opening <h3 ...> tag.
            if anchor is None:
                pos = text.find(anchorMark)
                if pos != -1:
                    anchor = _cut_until(text, text.find(">", pos) + 1, anchorEndMark)
            # Subscriber count.
            if subscribe is None:
                pos = text.find(subscribeMark)
                if pos != -1:
                    subscribe = _cut_until(text, pos + len(subscribeMark), subscribeEndMark)
            # Id used by the video site.
            if video is None:
                pos = text.find(videoMark)
                if pos != -1:
                    video = _cut_until(text, pos + len(videoMark), videoEndMark)
            if anchor is not None and subscribe is not None and video is not None:
                break

    if not anchor:
        print("roomId:" + str(roomId) + "==>【未找到】")
        return

    if subscribe is None:
        subscribe = "0"
    if video is None:
        video = ""
    endMark = ""
    if subscribe == '0':
        endMark = "\t[alarm:subscribe=0]"
        # Without a video id the fallback would only fetch the site root.
        if video:
            subscribe = get_subscribe_num_by_idstr(video)
    resultOut = str(roomId) + "\t" + anchor + "\t" + subscribe + "\t" + video + endMark
    print(resultOut)
    resultOut += "\n"
    save_data_to_file(fileName, resultOut.encode(encoding="utf-8"))
    

def catch_data(fileName, url, start, num):
    """Scrape *num* consecutive room pages, starting at room id *start*.

    Args:
        fileName: File that web_content_pro() appends the records to.
        url: Base URL; the room id is appended to it.
        start: First room id.
        num: Number of consecutive room ids to visit.
    """
    # The unused local that rebuilt the URL from the global urlStd is gone;
    # the function now depends only on its own parameters.
    for roomId in range(start, start + num):
        web_content_pro(url + str(roomId), roomId, fileName)
        
        
# Concurrent tasks: the scrape is split across worker processes.
from multiprocessing import Process
from os import getpid
# Base URL of a Huya room page; the room id is appended to it.
urlStd = "https://www.huya.com/"
# Leftover sample values (unused):
#catchStart = 298039
#catchStart = 521000
#catchNum = 2
# Path prefix of the per-worker output files ("<prefix><index>.txt").
fileForSaveData = "data/fansData"

def catch_data_task(index, start, num):
    """Worker entry point: scrape *num* rooms from *start* into this worker's file.

    The output file is ``fileForSaveData + str(index) + ".txt"``; it is
    emptied first so results of an earlier loop are not carried over.
    """
    workerFile = "{}{}.txt".format(fileForSaveData, index)
    create_file(workerFile)
    catch_data(workerFile, urlStd, start, num)
    
# Merge the per-worker files into the target file.
fileForSaveData = "data/fansData"
targetFileName = "data/target.txt"


def target_file_add_content(num):
    """Append the contents of worker files 0..num-1 to the target file.

    Worker file ``i`` is ``fileForSaveData + str(i) + ".txt"``; the files
    are appended to ``targetFileName`` in index order, as raw bytes.

    Args:
        num: Number of worker files to merge.

    Raises:
        OSError: if the target or one of the worker files cannot be opened.
    """
    # Context managers close both handles even when a read or write fails.
    with open(targetFileName, "ab") as target:
        for index in range(num):
            partName = fileForSaveData + str(index) + ".txt"
            with open(partName, "rb") as part:
                target.write(part.read())

# Main driver.
# Note: 1000 room ids with 20 processes took about 430 s / 380 s when measured.
def task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes):
    """Scrape a range of Huya rooms with worker processes and log the result.

    A header describing the run is appended to ``targetFileName``.  Then, for
    each of *loopTimes* rounds, *multiProcessNum* processes are started; each
    scrapes *roomIdInterval* consecutive room ids.  After all processes of a
    round have finished, their output files are merged into the target file.
    Finally the elapsed time is printed and appended to the target file.

    Args:
        roomIdStart: First room id to visit.
        roomIdInterval: Number of room ids handled by one process per round.
        multiProcessNum: Number of processes started per round.
        loopTimes: Number of rounds.

    Example values: roomIdStart=2000, roomIdInterval=10, multiProcessNum=20,
    loopTimes=10.
    """
    # Single module import; the earlier code rebound the local name ``time``
    # from the module to the function halfway through the body.
    import time

    roomIdEnd = roomIdStart + roomIdInterval * multiProcessNum * loopTimes
    title = "\n================= new record ==================\n"
    title += "time:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n"
    title += "roomIdStart:" + str(roomIdStart) + "\nroomIdInterval:" + str(roomIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    title += "range[" + str(roomIdStart) + ":" + str(roomIdEnd) + "]" + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    started = time.time()
    for loop in range(loopTimes):
        loopStart = roomIdStart + loop * roomIdInterval * multiProcessNum
        workers = [
            Process(
                target=catch_data_task,
                args=(x, loopStart + (x * roomIdInterval), roomIdInterval),
            )
            for x in range(multiProcessNum)
        ]
        for worker in workers:
            worker.start()
        # Wait for the whole round before merging, so every file is complete.
        for worker in workers:
            worker.join()
        target_file_add_content(multiProcessNum)
    elapsed = time.time() - started

    timeStr = '==========================\n总共时%.2f秒' % elapsed
    print(timeStr)
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))
    
def main():
    """Validate the four command-line arguments and start the Huya scrape.

    Expected arguments, in order: roomIdStart (0..10000000), roomIdInterval
    (0..10000), multiProcessNum (0..20) and loopTimes (0..20).  On the first
    problem found (too few arguments, a non-integer value, or a value out of
    range) the usage text plus a description of the problem is printed and
    the function returns without starting anything.  Arguments beyond the
    fourth are ignored.
    """
    usage = "useage: python " + sys.argv[0] + " <roomIdStart[0:10000000]>" + " <roomIdInterval[0:10000]>" + " <multiProcessNum[0:20]>" + " <loopTimes[0:20]>"
    argCount = len(sys.argv)
    if argCount <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(argCount))
        return

    # (parameter name, inclusive upper bound); the lower bound is always 0.
    limits = (
        ("roomIdStart", 10000000),
        ("roomIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for position, (label, upper) in enumerate(limits, start=1):
        raw = sys.argv[position]
        try:
            value = int(raw)
        except ValueError:
            # Previously a non-numeric argument ended in an uncaught traceback.
            print(usage + "\nparam " + label + " is not an integer, you input ==>" + raw)
            return
        if value < 0 or value > upper:
            print(usage + "\nparam " + label + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    roomIdStart, roomIdInterval, multiProcessNum, loopTimes = values
    print("your input ==>\nroomIdStart:%d roomIdInterval:%d multiProcessNum:%d loopTimes:%d\n"%(roomIdStart, roomIdInterval, multiProcessNum, loopTimes))
    task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes)
    
#=============================
# Script entry point: main() runs only when this file is executed directly.
if __name__ == '__main__':
    main()
    # Disabled debug output for inspecting the command line:
    #print(sys.argv[0])
    #print(len(sys.argv))

##################################
#          测       试       :b站用户粉丝数量信息抓取(多次使用可能被短期禁止访问)    #
##################################
from urllib.request import urlopen
from multiprocessing import freeze_support,Lock,Process,Value
import sys

# Shared integer counter handed to every worker process; workers increment it
# once per processed user id to report progress.
g_var_cnt=Value('i',0)
# Lock handed to the workers together with the counter; they hold it while
# incrementing g_var_cnt.
g_var_lock=Lock()


def create_file(name):
    """Create an empty file called *name*, discarding any previous content."""
    # Opening for writing truncates the file; no data is written.
    with open(name, "w"):
        pass
    
def write_to_file(name, bufIn):
    """Append the text *bufIn* to the file *name*, encoded as UTF-8.

    Args:
        name: Path of the file to append to; it is created if missing.
        bufIn: Text to append.
    """
    # An explicit encoding keeps the output independent of the platform's
    # locale and consistent with the UTF-8 bytes written elsewhere in this
    # script; the context manager closes the file even if the write fails.
    with open(name, "a", encoding="utf-8") as out:
        out.write(bufIn)

# Reference (getting hidden web-page info): https://blog.csdn.net/qq_38270802/article/details/90204609
# Example URLs:
#   "https://space.bilibili.com/10558188"
#   "https://api.bilibili.com/x/space/acc/info?mid=10558188&jsonp=jsonp"
#   "https://api.bilibili.com/x/relation/stat?vmid=10558188&jsonp=jsonp"
def _find_between(text, startMark, endMark):
    """Return the text between the first startMark and the next endMark, or None."""
    begin = text.find(startMark)
    if begin == -1:
        return None
    begin += len(startMark)
    end = text.find(endMark, begin)
    if end == -1:
        return None
    return text[begin:end]


def get_user_info(totle, fileName, rangeStart, rangeEnd, cnt, lock):
    """Fetch name, following count and follower count for a range of user ids.

    For every id in ``range(rangeStart, rangeEnd)`` two API URLs are
    requested and the values are cut out of the replies by marker search.
    One line ``id<TAB>name<TAB>following<TAB>follower`` per id is written
    to *fileName* (overwriting it, UTF-8) and a progress line is printed.
    A value whose markers are missing falls back to ``"[null]"`` (name) or
    ``0`` (counts) for that id.

    Args:
        totle: Number of ids processed by all workers of the current round;
            used only to compute the progress percentage.
        fileName: Output file, overwritten.
        rangeStart: First user id (inclusive).
        rangeEnd: Last user id (exclusive).
        cnt: Shared counter with a ``value`` attribute, incremented per id.
        lock: Lock held while the counter is updated.

    Raises:
        Exceptions raised by ``urlopen`` or by decoding are not caught.
        ValueError: if a count found between its markers is not an integer.
    """
    urlForInfoStart = "https://api.bilibili.com/x/space/acc/info?mid="
    urlForInfoEnd = "&jsonp=jsonp"
    nameMarkStart = "\"name\":\""
    nameMarkEnd = "\",\"sex\":\""

    urlForStatStart = "https://api.bilibili.com/x/relation/stat?vmid="
    urlForStatEnd = "&jsonp=jsonp"
    followingMarkStart = "\"following\":"
    followingMarkEnd = ",\"whisper\":"
    followMarkStart = "\"follower\":"
    followMarkEnd = "}}"

    # Explicit UTF-8: these files are later merged byte-for-byte into a
    # target file whose header is UTF-8, and names may not be representable
    # in the platform's default encoding.
    with open(fileName, "w", encoding="utf-8") as out:
        for userId in range(rangeStart, rangeEnd):
            # Reset per user so a failed lookup cannot reuse the values of
            # the previous user.
            name = "[null]"
            following = 0
            follow = 0

            # ---- nickname (the extra hard-coded request that used to be
            # sent before this one is gone; its reply was never used) ----
            with urlopen(urlForInfoStart + str(userId) + urlForInfoEnd) as response:
                reply = response.read().decode("utf-8")
            found = _find_between(reply, nameMarkStart, nameMarkEnd)
            if found is not None:
                name = found

            # ---- following / follower counts ----
            with urlopen(urlForStatStart + str(userId) + urlForStatEnd) as response:
                reply = response.read().decode("utf-8")
            found = _find_between(reply, followingMarkStart, followingMarkEnd)
            if found is not None:
                following = int(found)
            found = _find_between(reply, followMarkStart, followMarkEnd)
            if found is not None:
                follow = int(found)

            # ---- progress and output ----
            with lock:
                cnt.value += 1
                done = cnt.value
            progress = "\r%d%%\t" % (done * 100 / totle)
            progress += "%d\t%s\t%d\t%d" % (userId, name, following, follow)
            # Pad to 40 columns so a shorter line overwrites the previous one.
            print(progress.ljust(40), end="")
            out.write("%d\t%s\t%d\t%d\n" % (userId, name, following, follow))

    print("")

def get_user_info_task(dataFileName, loopTimes, loop, index, multiProcessNum, start, interval, cnt, lock):
    """Worker entry point: announce the round, then scrape this worker's id range.

    The worker handles ids ``start`` .. ``start + interval - 1`` and writes
    them to ``dataFileName + str(index) + ".txt"``.  The progress total
    passed on is ``multiProcessNum * interval``.
    """
    banner = "=======================================>task[%d/%d]%d%%"
    print(banner % (loop + 1, loopTimes, loop * 100 / loopTimes))
    get_user_info(
        multiProcessNum * interval,
        dataFileName + str(index) + ".txt",
        start,
        start + interval,
        cnt,
        lock,
    )
    

def target_file_add_content(targetFileName, dataFileName, num):
    """Append the contents of worker files 0..num-1 to the target file.

    Args:
        targetFileName: File the data is appended to (created if missing).
        dataFileName: Path prefix of the worker files; worker file ``i`` is
            ``dataFileName + str(i) + ".txt"``.
        num: Number of worker files to merge, in index order.

    Raises:
        OSError: if the target or one of the worker files cannot be opened.
    """
    # Context managers close both handles even when a read or write fails.
    with open(targetFileName, "ab") as target:
        for index in range(num):
            partName = dataFileName + str(index) + ".txt"
            with open(partName, "rb") as part:
                target.write(part.read())

def task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes):
    """Scrape a range of bilibili user ids with worker processes and log the result.

    A header describing the run is appended to ``data/target.txt``.  Then,
    for each of *loopTimes* rounds, *multiProcessNum* processes are started;
    each handles *userIdInterval* consecutive user ids.  After all processes
    of a round have finished, their ``data/biliData<i>.txt`` files are merged
    into the target file.  Finally a summary (elapsed time, planned range and
    number of ids actually processed) is printed and appended.

    Args:
        userIdStart: First user id to visit.
        userIdInterval: Number of user ids handled by one process per round.
        multiProcessNum: Number of processes started per round.
        loopTimes: Number of rounds.
    """
    import time
    started = time.time()
    dataFileName = "data/biliData"
    targetFileName = "data/target.txt"
    plannedTotal = userIdInterval * multiProcessNum * loopTimes
    userIdEnd = userIdStart + plannedTotal

    title = "\n================= new record ==================\n"
    title += "time:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "\n"
    title += "userIdStart:" + str(userIdStart) + "\nuserIdInterval:" + str(userIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    title += "range[" + str(userIdStart) + ":" + str(userIdEnd) + "]" + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    searchedTotal = 0
    for loop in range(loopTimes):
        # The shared counter drives the per-round progress percentage, so it
        # restarts at zero every round.
        g_var_cnt.value = 0
        loopStart = userIdStart + loop * userIdInterval * multiProcessNum
        workers = [
            Process(
                target=get_user_info_task,
                args=(dataFileName, loopTimes, loop, i, multiProcessNum, loopStart + (i * userIdInterval), userIdInterval, g_var_cnt, g_var_lock),
            )
            for i in range(multiProcessNum)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        # Add up the rounds; reporting the counter itself at the end would
        # only show what the last round processed.
        searchedTotal += g_var_cnt.value
        target_file_add_content(targetFileName, dataFileName, multiProcessNum)
    elapsed = time.time() - started

    timeStr = '\n==========================\n总共时%.2f秒, 搜索%d位用户[%d:%d], 实际搜索%d(访问可能被禁止)' % (elapsed, plannedTotal, userIdStart, userIdEnd, searchedTotal)
    print(timeStr)
    # Written as UTF-8 bytes like the header, not in the locale encoding.
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))


def main():
    """Validate the four command-line arguments and start the bilibili scrape.

    Expected arguments, in order: userIdStart (0..100000000), userIdInterval
    (0..10000), multiProcessNum (0..20) and loopTimes (0..20).  On the first
    problem found (too few arguments, a non-integer value, or a value out of
    range) the usage text plus a description of the problem is printed and
    the function returns without starting anything.  Arguments beyond the
    fourth are ignored.
    """
    usage = "useage: python " + sys.argv[0] + " <userIdStart[0:100000000]>" + " <userIdInterval[0:10000]>" + " <multiProcessNum[0:20]>" + " <loopTimes[0:20]>"
    argCount = len(sys.argv)
    if argCount <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(argCount))
        return

    # (parameter name, inclusive upper bound); the lower bound is always 0.
    limits = (
        ("userIdStart", 100000000),
        ("userIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for position, (label, upper) in enumerate(limits, start=1):
        raw = sys.argv[position]
        try:
            value = int(raw)
        except ValueError:
            # Previously a non-numeric argument ended in an uncaught traceback.
            print(usage + "\nparam " + label + " is not an integer, you input ==>" + raw)
            return
        if value < 0 or value > upper:
            print(usage + "\nparam " + label + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    userIdStart, userIdInterval, multiProcessNum, loopTimes = values
    print("============================================================")
    print("                       task  start                          ")
    print("userIdStart:%d userIdInterval:%d multiProcessNum:%d loopTimes:%d"%(userIdStart, userIdInterval, multiProcessNum, loopTimes))
    print("============================================================")
    task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes)


# Script entry point: main() runs only when this file is executed directly.
if __name__ == '__main__':
    main()

上一篇下一篇

猜你喜欢

热点阅读