nodejs 爬视频初探
2021-05-08 本文已影响0人
yes先生boss
直接上代码
var cheerio = require("cheerio");
var fs = require('fs');
var async = require("async");
const superagent = require('superagent')
const request = require('request')
const mkdirp = require('mkdirp')
const path = require('path')
// Crawler settings.
const options = {
    uri: 'http://xxxxx', // video site to crawl
    dirfile: './output/', // directory the downloads are saved under
    downLimit: 2 // maximum number of videos downloaded in parallel
};
// Categories and videos discovered so far; both are filled in by down().
const prolist = [];
const videolist = [];

// Start the crawl. down() is async, so report a failed run here instead of
// leaving its rejection unhandled.
down(options.uri).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
/**
 * Crawl the listing page at `url`, read the detail page of every category
 * found there, then download each category's videos into its own folder.
 *
 * Every category and video found is also appended to the module-level
 * `prolist` / `videolist` arrays.
 *
 * A page that cannot be fetched is logged and skipped; it never blocks the
 * rest of the crawl.
 *
 * @param {string} url - Address of the listing (home) page.
 * @returns {Promise<void>} Settles once every download attempt has ended.
 */
async function down(url) {
    const parseOptions = {
        normalizeWhitespace: true,
        decodeEntities: false
    };

    // Listing page: one card per category.
    const homeHtml = await fetchHtml(superagent.get(url), url);
    if (homeHtml === null) {
        return;
    }
    const $home = cheerio.load(homeHtml, parseOptions);
    const categories = [];
    // The selectors come from inspecting the site's markup in the browser.
    $home(".comapny-card.bg-fff.div-animationone").each((i, obj) => {
        const card = $home(obj);
        categories.push({
            dir: card.find("h6").text(),
            title: card.find("a").attr("title"),
            url: card.find("a").attr("href")
        });
    });
    prolist.push(...categories);

    // Detail pages: collect the videos of each category separately, so that
    // every video is later saved into the folder of its own category.
    const jobs = [];
    for (const category of categories) {
        if (!category.url) {
            console.log(`页面获取失败:${category.title}`);
            continue;
        }
        await mkdir(category.dir);
        // The site only answers requests that look like they come from a
        // browser, hence the extra headers.
        const detailRequest = superagent
            .get(category.url)
            .set("Connection", "keep-alive")
            .set("Content-Length", 0)
            .set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
            .set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36");
        const detailHtml = await fetchHtml(detailRequest, category.url);
        if (detailHtml === null) {
            continue;
        }
        const $detail = cheerio.load(detailHtml, parseOptions);
        const videos = [];
        // Each entry carries the video address in its `rel` attribute.
        $detail("#List1_1 .video_name1").each((i, obj) => {
            videos.push({
                title: $detail(obj).attr("title"),
                url: $detail(obj).attr("rel")
            });
        });
        videolist.push(...videos);
        jobs.push({ dir: category.dir, videos });
    }

    // Pause kept from the original flow (NOTE(review): its purpose is not
    // stated in the code — confirm whether it is still needed).
    await sleep(2000);

    // Download the videos, one category at a time.
    const pending = jobs.filter((job) => job.videos.length > 0);
    if (pending.length === 0) {
        return;
    }
    for (const job of pending) {
        // Wait on the completion callback rather than on downliu's return
        // value, so this works whether or not downliu returns a promise.
        await new Promise((resolve) => {
            downliu(job.dir, job.videos, () => resolve());
        });
    }
    console.log('下载结束');
}

/**
 * Send a prepared superagent request and hand back the response body.
 *
 * @param {object} req - superagent request, already configured.
 * @param {string} label - Address printed in the failure log line.
 * @returns {Promise<string|null>} The body text, or `null` when the request
 *   failed, the status was not 200, or the response had no text body.
 */
function fetchHtml(req, label) {
    return new Promise((resolve) => {
        req.end((error, response) => {
            const ok = !error
                && response
                && response.statusCode === 200
                && typeof response.text === 'string';
            if (!ok) {
                console.log(`页面获取失败:${label}`);
                resolve(null);
                return;
            }
            resolve(response.text);
        });
    });
}
/**
 * Create the directory that a category's videos are saved in.
 *
 * The directory is `options.dirfile + title`; missing parent directories are
 * created as well. Creation is synchronous, so the directory exists by the
 * time this function returns. A failure is logged rather than thrown.
 *
 * @param {string} title - Folder name, appended to `options.dirfile`.
 * @returns {void}
 */
function mkdir(title) {
    console.log('创建目录:%s', title);
    const target = options.dirfile + title;
    if (fs.existsSync(target)) {
        return;
    }
    try {
        fs.mkdirSync(target, { recursive: true });
        console.log(`目录:${title} 创建成功`);
    } catch (err) {
        // Only report success when the directory was really created.
        console.log(`目录:${title} 创建失败:${err.message}`);
    }
}
/**
 * Pause for a while.
 *
 * @param {number} duration - Delay in milliseconds, handed to setTimeout.
 * @returns {Promise<undefined>} Fulfilled, with no value, when the timer fires.
 */
function sleep(duration) {
    const startTimer = (done) => {
        setTimeout(done, duration);
    };
    return new Promise(startTimer);
}
/**
 * Download a list of videos into `options.dirfile + dir`, running at most
 * `options.downLimit` downloads at the same time.
 *
 * Each file is named after the video's title (spaces removed) plus ".mp4".
 * A download that fails is logged and skipped so the remaining ones still run.
 *
 * @param {string} dir - Folder name under `options.dirfile`.
 * @param {Array<{title: string, url: string}>} links - Videos to fetch.
 * @param {Function} [callback] - Called once every download attempt has ended.
 * @returns {Promise<void>} Resolves right after `callback` has been called.
 */
function downliu(dir, links, callback) {
    console.log(`发现${links.length}个视频,准备开始下载...`);
    return new Promise((resolve) => {
        async.eachLimit(links, options.downLimit, function (video, next) {
            // An item may signal completion only once, but several stream
            // events can fire for the same download.
            let settled = false;
            const finish = () => {
                if (settled) {
                    return;
                }
                settled = true;
                next();
            };

            const usable = video
                && typeof video.title === 'string' && video.title
                && typeof video.url === 'string' && video.url;
            if (!usable) {
                console.log('跳过无效的视频条目');
                finish();
                return;
            }

            // basename() drops everything up to the last "/" in the title,
            // so the title cannot point outside the target folder.
            const fileName = path.basename(video.title).replace(/ /g, '');
            const toPath = path.join(options.dirfile + dir, fileName);
            console.log(`开始下载视频:${fileName},保存到:${dir}`);

            let source;
            let target;
            const fail = (reason) => {
                if (settled) {
                    return;
                }
                console.log(`视频下载失败:${video.url} (${reason})`);
                // Stop the transfer and release the file handle.
                if (source) {
                    source.abort();
                }
                if (target) {
                    target.destroy();
                }
                finish();
            };

            try {
                source = request(encodeURI(video.url));
                target = fs.createWriteStream(toPath + ".mp4");
            } catch (err) {
                fail(err.message);
                return;
            }

            source.on('error', (err) => fail(err.message));
            source.on('response', (res) => {
                // Do not save an error page as if it were a video.
                if (res.statusCode < 200 || res.statusCode >= 300) {
                    fail(`HTTP ${res.statusCode}`);
                }
            });
            target.on('error', (err) => fail(err.message));
            target.on('finish', () => {
                if (settled) {
                    return;
                }
                console.log(`视频下载成功:${video.url}`);
                finish();
            });
            source.pipe(target);
        }, function (err) {
            if (typeof callback === 'function') {
                callback(err);
            }
            resolve();
        });
    });
}