虎牙、B站网页信息python抓取试试
2021-08-24 本文已影响0人
千转军师
利用虎牙和b站的网页来抓取用户及其粉丝数量
使用时要注意:
(1)cmd 命令下 python test.py
后面有参数,如果没敲参数,会有提示,如:
useage: python test.py <roomIdStart[0:10000000]> <roomIdInterval[0:10000]> <multiProcessNum[0:20]> <loopTimes[0:20]>
param num is less than 5, num is 1
含义是:起始房间号、每个进程搜索的房间数(搜索组的大小)、进程数、循环次数
搜索的总共房间数 = 搜索组的大小 * 进程数 * 循环次数
(2)结果
结果放在文件 data/target.txt
注意事先创建文件夹 data,否则提示没有 data/target.txt
(3)例子(虎牙):
================= new record ==================
time:2022-02-20 10:18:55
roomIdStart:1000
roomIdInterval:10
multiProcessNum:18
loopTimes:2
range[1000:1360]
===============================================
1066 电子厂-心态 7945900 1539218884
1123 奇领颜韵Ycy【万徒】 117598 1199552286636
==========================
总共时18.54秒
##################################
# 每日一抓: 虎牙粉丝排行榜数据 #
##################################
from urllib.request import urlopen
import sys
#创建(覆盖)文件
def create_file(name):
    """Create the file ``name``, truncating it to zero length if it exists.

    Args:
        name: Path of the file to create or overwrite.
    """
    # Mode "w" creates or truncates; the context manager closes the handle
    # right away because nothing is written here.
    with open(name, "w"):
        pass
#数据存入文件
def save_data_to_file(fileName, bufUtf8):
    """Append already-encoded bytes to the end of a file.

    Args:
        fileName: Path of the file to append to (created if missing).
        bufUtf8: Bytes to append; the file is opened in binary mode, so the
            caller is responsible for encoding.
    """
    # The context manager closes the handle; the original never closed it.
    with open(fileName, "ab") as f:
        f.write(bufUtf8)
#抓取网页数据2 虎牙视频网页,抓取订阅数
#例子: https://v.huya.com/u/1199553057095
def get_subscribe_num_by_idstr(idstr, videoBaseUrl='https://v.huya.com/u/'):
    """Read the subscriber count from a Huya video page.

    Only response lines 180 to 280 (1-based, inclusive) are examined. A line
    containing the subscribe marker is sliced on the assumption that it
    starts with the marker and ends with the end marker; if several lines in
    the window match, the last one wins.

    Args:
        idstr: Id string appended to ``videoBaseUrl`` to form the page URL.
        videoBaseUrl: URL prefix of the video page. Defaults to the Huya
            address that was previously hard-coded.

    Returns:
        The text between the markers, or ``"0"`` when no line in the window
        contains the marker.
    """
    subscribeMark = " <span>订阅:<em>"
    subscribeEndMark = "</em></span>\r\n"
    first_line, last_line = 180, 280
    result = "0"
    # The context manager closes the HTTP response, which was leaked before.
    with urlopen(videoBaseUrl + idstr) as response:
        for cnt, line in enumerate(response, start=1):
            if cnt < first_line:
                continue
            if cnt > last_line:
                # Nothing past the window is ever inspected, so stop reading
                # instead of pulling the rest of the page.
                break
            line_str = line.decode(encoding="utf-8")
            if line_str.find(subscribeMark) != -1:
                result = line_str[len(subscribeMark): len(line_str) - len(subscribeEndMark)]
    return result
#抓取网页数据
#例子: https://www.huya.com/298039
def web_content_pro(url, roomId, fileName):
    """Scrape one Huya room page and record the anchor's data.

    Response lines 100 to 200 (1-based, inclusive) are searched for three
    markers: the anchor name, the subscriber count and the video-page id.
    The first matching line is used for each. The subscriber and video
    slices assume the line starts with the marker and ends with the
    corresponding end marker.

    When an anchor name is found, a tab-separated record
    ``roomId, anchor, subscribe, video`` is printed and appended to
    ``fileName``. If the room page reported no subscriber count, the count
    is looked up on the video page instead and the record is tagged with
    ``[alarm:subscribe=0]``. Otherwise a "not found" message is printed and
    nothing is written.

    Args:
        url: Full URL of the room page.
        roomId: Room id, used only for output.
        fileName: File the UTF-8 encoded record is appended to.
    """
    anchorMark = "<h3 class=\"host-name\" title="
    anchorEndMark = "</h3>\r\n"
    subscribeMark = " <div class=\"subscribe-count\" id=\"activityCount\">"
    subscribeEndMark = "</div>\r\n"
    videoMark = " <a class=\"host-video\" href=\"http://v.huya.com/u/"
    videoEndMark = "\" target=\"_blank\"><i></i><em>视频</em></a>\r\n"
    anchor = ""
    subscribe = "0"
    video = ""
    anchor_found = False
    subscribe_found = False
    video_found = False
    first_line, last_line = 100, 200
    # The context manager closes the HTTP response, which was leaked before.
    with urlopen(url) as response:
        for cnt, line in enumerate(response, start=1):
            if cnt < first_line:
                continue
            if cnt > last_line:
                # Lines past the window are never inspected; stop reading.
                break
            line_str = line.decode(encoding="utf-8")
            length = len(line_str)
            # Anchor name: text after the first ">" up to the closing tag.
            if not anchor_found and anchorMark in line_str:
                anchor_found = True
                anchor = line_str[line_str.find(">") + 1: length - len(anchorEndMark)]
            # Subscriber count.
            if not subscribe_found and subscribeMark in line_str:
                subscribe_found = True
                subscribe = line_str[len(subscribeMark): length - len(subscribeEndMark)]
            # Id of the anchor's video page.
            if not video_found and videoMark in line_str:
                video_found = True
                video = line_str[len(videoMark): length - len(videoEndMark)]
            if anchor_found and subscribe_found and video_found:
                break
    if len(anchor) > 0:
        endMark = ""
        if subscribe == '0':
            # Fall back to the video page and flag the record.
            subscribe = get_subscribe_num_by_idstr(video)
            endMark = "\t[alarm:subscribe=0]"
        resultOut = str(roomId) + "\t" + anchor + "\t" + subscribe + "\t" + video + endMark
        print(resultOut)
        resultOut += "\n"
        save_data_to_file(fileName, resultOut.encode(encoding="utf-8"))
    else:
        print("roomId:" + str(roomId) + "==>【未找到】")
#抓取数据
def catch_data(fileName, url, start, num):
    """Scrape ``num`` consecutive room pages starting at room id ``start``.

    Args:
        fileName: File that found records are appended to.
        url: Base URL; the room id is appended to it for each request.
        start: First room id.
        num: Number of consecutive room ids to visit.
    """
    # The unused ``urlTmp`` local was dropped: it was built from the global
    # ``urlStd`` rather than the ``url`` argument and never read.
    for room_id in range(start, start + num):
        web_content_pro(url + str(room_id), room_id, fileName)
#并发任务
from multiprocessing import Process
from os import getpid
# Base URL of a Huya room page; the numeric room id is appended to it.
urlStd = "https://www.huya.com/"
# Disabled manual test values:
#catchStart = 298039
#catchStart = 521000
#catchNum = 2
# Path prefix of the per-process result files; "<index>.txt" is appended.
fileForSaveData = "data/fansData"
def catch_data_task(index, start, num):
    """Worker entry point: scrape one group of rooms into its own file.

    The output file is ``fileForSaveData`` + ``index`` + ".txt"; it is
    emptied first and then filled by ``catch_data``.

    Args:
        index: Worker index, used to name the output file.
        start: First room id of the group.
        num: Number of room ids in the group.
    """
    out_path = "".join((fileForSaveData, str(index), ".txt"))
    create_file(out_path)
    catch_data(out_path, urlStd, start, num)
#目标文件添加内容
# NOTE: repeats the identical fileForSaveData assignment made earlier in this file.
fileForSaveData = "data/fansData"
# File that collects the run header, the merged per-process results and the footer.
targetFileName = "data/target.txt"
def target_file_add_content(num):
    """Append the per-process result files to the target file, in index order.

    Reads ``fileForSaveData`` + ``x`` + ".txt" for ``x`` in ``range(num)`` and
    appends each file's bytes to ``targetFileName``.

    Args:
        num: Number of per-process files to merge.
    """
    # Context managers close both handles even if a part file is missing.
    with open(targetFileName, "ab") as target:
        for x in range(num):
            part_name = fileForSaveData + str(x) + ".txt"
            with open(part_name, "rb") as part:
                target.write(part.read())
#主函数
# Note: 1000 room ids with 20 processes took 430 s and 380 s in two measured runs.
def task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes):
    """Scrape a range of Huya rooms with batches of worker processes.

    A header describing the run is appended to ``targetFileName``. Then, for
    each of ``loopTimes`` batches, ``multiProcessNum`` processes are started;
    each scrapes ``roomIdInterval`` consecutive room ids into its own file.
    After all processes of a batch have finished, their files are merged into
    the target file. Finally the elapsed time is printed and appended.

    Example values: roomIdStart=2000, roomIdInterval=10, multiProcessNum=20,
    loopTimes=10.

    Args:
        roomIdStart: First room id of the whole run.
        roomIdInterval: Number of room ids handled by one process.
        multiProcessNum: Number of processes per batch.
        loopTimes: Number of batches.
    """
    # One module import; the original imported the module and then rebound
    # the same local name to the ``time.time`` function.
    import time

    batch_size = roomIdInterval * multiProcessNum
    title = "\n================= new record ==================\n"
    timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    title += "time:" + timestr + "\n"
    title += "roomIdStart:" + str(roomIdStart) + "\nroomIdInterval:" + str(roomIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    rangeStr = "range[" + str(roomIdStart) + ":" + str(roomIdStart + batch_size * loopTimes) + "]"
    title += rangeStr + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    start = time.time()
    for loop in range(loopTimes):
        loopStart = roomIdStart + loop * batch_size
        proce = []
        for x in range(multiProcessNum):
            worker = Process(target=catch_data_task, args=(x, loopStart + (x * roomIdInterval), roomIdInterval))
            worker.start()
            proce.append(worker)
        for worker in proce:
            worker.join()
        # Merge inside the loop: the next batch overwrites the per-process files.
        target_file_add_content(multiProcessNum)
    end = time.time()

    timeStr = '==========================\n总共时%.2f秒' % (end - start)
    print(timeStr)
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))
def main():
    """Parse and validate the command line, then start the Huya scrape.

    Expects four integer arguments in ``sys.argv[1:5]``: roomIdStart
    (0..10000000), roomIdInterval (0..10000), multiProcessNum (0..20) and
    loopTimes (0..20). Arguments are checked in order; on the first problem
    the usage text plus a reason is printed and the function returns without
    starting anything. A value that is not an integer is reported the same
    way as an out-of-range value instead of raising ``ValueError``.
    """
    argv = sys.argv
    usage = "useage: python " + argv[0] + " <roomIdStart[0:10000000]>" + " <roomIdInterval[0:10000]>" + " <multiProcessNum[0:20]>" + " <loopTimes[0:20]>"
    if len(argv) <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(len(argv)))
        return

    # (parameter name, inclusive upper bound); the lower bound is always 0.
    limits = (
        ("roomIdStart", 10000000),
        ("roomIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for (param, upper), raw in zip(limits, argv[1:5]):
        try:
            value = int(raw)
        except ValueError:
            value = None
        if value is None or value < 0 or value > upper:
            print(usage + "\nparam " + param + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    roomIdStart, roomIdInterval, multiProcessNum, loopTimes = values
    print("your input ==>\nroomIdStart:%d roomIdInterval:%d multiProcessNum:%d loopTimes:%d\n"%(roomIdStart, roomIdInterval, multiProcessNum, loopTimes))
    task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes)
#=============================
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
#print(sys.argv[0])
#print(len(sys.argv))
##################################
# 测 试 :b站用户粉丝数量信息抓取(多次使用可能被短期禁止访问) #
##################################
from urllib.request import urlopen
from multiprocessing import freeze_support,Lock,Process,Value
import sys
# Shared integer counter (typecode 'i', initial value 0). task_start resets it
# before each batch and passes it to every worker process.
g_var_cnt=Value('i',0)
# Lock passed to the workers together with the counter; they hold it while
# incrementing the counter.
g_var_lock=Lock()
#创建(覆盖)文件
def create_file(name):
    """Create the file ``name``, truncating it to zero length if it exists.

    Args:
        name: Path of the file to create or overwrite.
    """
    # Mode "w" creates or truncates; the context manager closes the handle
    # right away because nothing is written here.
    with open(name, "w"):
        pass
#向文件写入内容
def write_to_file(name, bufIn):
    """Append text to a file.

    Args:
        name: Path of the file to append to (created if missing).
        bufIn: Text to append. The file is opened in text mode with the
            platform's default encoding.
    """
    # The context manager closes the handle even when the write raises
    # (for example on an encoding error).
    with open(name, "a") as f:
        f.write(bufIn)
#参考: 获取网页隐藏信息 https://blog.csdn.net/qq_38270802/article/details/90204609
#"https://space.bilibili.com/10558188"
#"https://api.bilibili.com/x/space/acc/info?mid=10558188&jsonp=jsonp"
#"https://api.bilibili.com/x/relation/stat?vmid=10558188&jsonp=jsonp"
def get_user_info(totle, fileName, rangeStart, rangeEnd, cnt, lock):
    """Fetch name, following count and follower count for a range of user ids.

    For every id in ``range(rangeStart, rangeEnd)`` two API URLs are
    requested: the account-info URL (for the name) and the relation-stat URL
    (for the counts). Values are cut out of the response text with fixed
    start/end markers. A value whose markers are not found keeps its default
    for that user: ``"[null]"`` for the name and ``0`` for the counts.

    One tab-separated line ``id, name, following, follower`` per user is
    written to ``fileName`` as UTF-8, matching the UTF-8 header that
    ``task_start`` writes to the merged file. A progress line is printed for
    each user.

    Args:
        totle: Number of users in the whole batch; used only for the
            progress percentage.
        fileName: Output file; overwritten.
        rangeStart: First user id (inclusive).
        rangeEnd: Last user id (exclusive).
        cnt: Shared counter object with a ``value`` attribute, incremented
            once per processed user.
        lock: Lock held while the counter is updated and the progress line
            is printed.
    """
    urlForInfoStart = "https://api.bilibili.com/x/space/acc/info?mid="
    urlForInfoEnd = "&jsonp=jsonp"
    nameMarkStart = "\"name\":\""
    nameMarkEnd = "\",\"sex\":\""
    urlForStatStart = "https://api.bilibili.com/x/relation/stat?vmid="
    urlForStatEnd = "&jsonp=jsonp"
    followingMarkStart = "\"following\":"
    followingMarkEnd = ",\"whisper\":"
    followMarkStart = "\"follower\":"
    followMarkEnd = "}}"

    with open(fileName, "w", encoding="utf-8") as f:
        for x in range(rangeStart, rangeEnd):
            # Reset per user so a failed parse cannot report the previous
            # user's data under this id.
            name = "[null]"
            following = 0
            follow = 0

            # User name. The extra hard-coded request for mid=10558188 that
            # preceded this call was removed: its response was discarded
            # and it doubled the number of info requests.
            with urlopen(urlForInfoStart + str(x) + urlForInfoEnd) as response:
                rst = response.read().decode("utf-8")
            pos1 = rst.find(nameMarkStart)
            pos2 = rst.find(nameMarkEnd)
            if pos1 != -1 and pos2 != -1:
                name = rst[pos1 + len(nameMarkStart):pos2]

            # Following and follower counts.
            with urlopen(urlForStatStart + str(x) + urlForStatEnd) as response:
                rst = response.read().decode("utf-8")
            pos1 = rst.find(followingMarkStart)
            pos2 = rst.find(followingMarkEnd)
            if pos1 != -1 and pos2 != -1:
                following = int(rst[pos1 + len(followingMarkStart):pos2])
            pos1 = rst.find(followMarkStart)
            pos2 = rst.find(followMarkEnd)
            if pos1 != -1 and pos2 != -1:
                follow = int(rst[pos1 + len(followMarkStart):pos2])

            # Progress output: "\r" rewrites the same console line; padding
            # to 40 columns hides leftovers of a longer previous line.
            with lock:
                cnt.value += 1
                outStr = "\r%d%%\t"%(cnt.value * 100 / totle)
                outStr += "%d\t%s\t%d\t%d"%(x, name, following, follow)
                print(outStr.ljust(40), end="")

            f.write("%d\t%s\t%d\t%d\n"%(x, name, following, follow))
    print("")
def get_user_info_task(dataFileName, loopTimes, loop, index, multiProcessNum, start, interval, cnt, lock):
    """Worker entry point: print a batch banner and scrape one id group.

    The group ``[start, start + interval)`` is written to
    ``dataFileName`` + ``index`` + ".txt". The batch total passed on for the
    progress percentage is ``multiProcessNum * interval``.

    Args:
        dataFileName: Path prefix of the per-process output files.
        loopTimes: Total number of batches (for the banner).
        loop: Zero-based index of the current batch (for the banner).
        index: Worker index, used to name the output file.
        multiProcessNum: Number of workers in the batch.
        start: First user id of this worker's group.
        interval: Number of user ids in the group.
        cnt: Shared progress counter, forwarded to ``get_user_info``.
        lock: Lock guarding the counter, forwarded to ``get_user_info``.
    """
    banner = "=======================================>task[%d/%d]%d%%"
    print(banner % (loop + 1, loopTimes, loop * 100 / loopTimes))
    part_path = "".join((dataFileName, str(index), ".txt"))
    batch_total = multiProcessNum * interval
    get_user_info(batch_total, part_path, start, start + interval, cnt, lock)
def target_file_add_content(targetFileName, dataFileName, num):
    """Append the per-process result files to the target file, in index order.

    Args:
        targetFileName: File the merged bytes are appended to.
        dataFileName: Path prefix of the part files; part ``x`` is
            ``dataFileName`` + ``x`` + ".txt".
        num: Number of part files to merge.
    """
    # Context managers close both handles even if a part file is missing.
    with open(targetFileName, "ab") as target:
        for x in range(num):
            part_name = dataFileName + str(x) + ".txt"
            with open(part_name, "rb") as part:
                target.write(part.read())
def task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes):
    """Scrape a range of Bilibili user ids with batches of worker processes.

    A header describing the run is appended to ``data/target.txt``. Then, for
    each of ``loopTimes`` batches, ``multiProcessNum`` processes are started;
    each handles ``userIdInterval`` consecutive ids and writes its own part
    file. After a batch finishes, the part files are merged into the target
    file. Finally a summary is printed and appended to the target file.

    Args:
        userIdStart: First user id of the whole run.
        userIdInterval: Number of ids handled by one process.
        multiProcessNum: Number of processes per batch.
        loopTimes: Number of batches.
    """
    import time

    start = time.time()
    dataFileName = "data/biliData"
    targetFileName = "data/target.txt"
    batch_size = userIdInterval * multiProcessNum
    rangeEnd = userIdStart + batch_size * loopTimes

    title = "\n================= new record ==================\n"
    timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    title += "time:" + timestr + "\n"
    title += "userIdStart:" + str(userIdStart) + "\nuserIdInterval:" + str(userIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
    rangeStr = "range[" + str(userIdStart) + ":" + str(rangeEnd) + "]"
    title += rangeStr + "\n"
    title += "===============================================\n"
    with open(targetFileName, "ab") as target:
        target.write(title.encode(encoding="utf8"))

    # The shared counter is reset per batch (workers use it for the batch
    # progress percentage), so the run total is accumulated separately.
    searched = 0
    for loop in range(loopTimes):
        g_var_cnt.value = 0
        loopStart = userIdStart + loop * batch_size
        proce=[Process(target=get_user_info_task, args=(dataFileName, loopTimes, loop, i, multiProcessNum, loopStart + (i * userIdInterval), userIdInterval, g_var_cnt, g_var_lock,)) for i in range(multiProcessNum)]
        for worker in proce:
            worker.start()
        for worker in proce:
            worker.join()
        searched += g_var_cnt.value
        # Merge inside the loop: the next batch overwrites the part files.
        target_file_add_content(targetFileName, dataFileName, multiProcessNum)
    end = time.time()

    timeStr = '\n==========================\n总共时%.2f秒, 搜索%d位用户[%d:%d], 实际搜索%d(访问可能被禁止)' % (end - start, batch_size * loopTimes, userIdStart, rangeEnd, searched)
    print(timeStr)
    # Written as UTF-8 bytes like the header, so the target file does not mix
    # the platform encoding with UTF-8.
    with open(targetFileName, "ab") as target:
        target.write(timeStr.encode(encoding="utf8"))
def main():
    """Parse and validate the command line, then start the Bilibili scrape.

    Expects four integer arguments in ``sys.argv[1:5]``: userIdStart
    (0..100000000), userIdInterval (0..10000), multiProcessNum (0..20) and
    loopTimes (0..20). Arguments are checked in order; on the first problem
    the usage text plus a reason is printed and the function returns without
    starting anything. A value that is not an integer is reported the same
    way as an out-of-range value instead of raising ``ValueError``.
    """
    argv = sys.argv
    usage = "useage: python " + argv[0] + " <userIdStart[0:100000000]>" + " <userIdInterval[0:10000]>" + " <multiProcessNum[0:20]>" + " <loopTimes[0:20]>"
    if len(argv) <= 4:
        print(usage + "\nparam num is less than 5, num is " + str(len(argv)))
        return

    # (parameter name, inclusive upper bound); the lower bound is always 0.
    limits = (
        ("userIdStart", 100000000),
        ("userIdInterval", 10000),
        ("multiProcessNum", 20),
        ("loopTimes", 20),
    )
    values = []
    for (param, upper), raw in zip(limits, argv[1:5]):
        try:
            value = int(raw)
        except ValueError:
            value = None
        if value is None or value < 0 or value > upper:
            print(usage + "\nparam " + param + " is out of range, you input ==>" + raw)
            return
        values.append(value)

    userIdStart, userIdInterval, multiProcessNum, loopTimes = values
    print("============================================================")
    print(" task start ")
    print("userIdStart:%d userIdInterval:%d multiProcessNum:%d loopTimes:%d"%(userIdStart, userIdInterval, multiProcessNum, loopTimes))
    print("============================================================")
    task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes)
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()